This notebook performs a basic overview of the data and analyses the genetic diversity indicators. It uses as input the “clean kobo output” that was first cleaned by 1.2_cleaning.

Get data and functions

Load required libraries:

library(tidyr)
library(dplyr)
library(utile.tools)
library(stringr)
library(ggplot2)
library(ggsankey)
library(alluvial)
library(viridis)
library(cowplot)

Load required functions. These custom fuctions are available at: https://github.com/AliciaMstt/GeneticIndicators

source("get_indicator1_data.R")
source("get_indicator2_data.R")
source("get_indicator3_data.R")
source("get_metadata.R")
source("transform_to_Ne.R")
source("estimate_indicator1.R")

Other custom functions:

# to imitate ggplot colors, thanks to https://stackoverflow.com/questions/8197559/emulate-ggplot2-default-color-palette
gg_color_hue <- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

# not in
'%!in%' <- function(x,y)!('%in%'(x,y))

Custom colors:

## IUCN official colors
IUCNcolors<-c("brown2", "darkgrey", "darkorange", "darkgreen", "bisque1", "green", "azure2", "yellow")

## nice soft ramp for taxonomic groups
taxoncolors<-cividis(12) # same than using cividis(length(levels(as.factor(metadata$taxonomic_group))))

## Colors for simplified methods to define populations 
# assuming the following (see below, it wont work here): levels(as.factor(ind2_data$defined_populations_simplified))
# [1] "adaptive_traits management_units"         "eco_biogeo_proxies"                       "genetic_clusters"                        
# [4] "genetic_clusters eco_biogeo_proxies"      "genetic_clusters geographic_boundaries"   "geographic_boundaries"                   
# [7] "geographic_boundaries adaptive_traits"    "geographic_boundaries eco_biogeo_proxies" "geographic_boundaries management_units"  
# [10] "low_freq_combinations"                    "management_units"                         "other"  
# get a set of colors to highlight genetic and geographic with similar colors

simplifiedmethods_colors<-c("#b34656", #"adaptive_traits management_units"
                            "#7f611b", # "eco_biogeo_proxies"
                            "#668cd1", # "genetic_clusters"     
                            "#668cd1", # "genetic_clusters eco_biogeo_proxies"     
                            "#45c097", # "genetic_clusters geographic_boundaries"  
                            "#d4b43e", # "geographic_boundaries"                   
                            "#d4b43e", # "geographic_boundaries adaptive_traits"
                            "#d4b43e", # "geographic_boundaries eco_biogeo_proxies"
                            "#d4b43e", # "geographic_boundaries management_units"  
                            "#be72c9", # "low_freq_combinations" 
                            "#b93921", # "management_units" 
                            "#d868a2")# "other" 

Get indicators data from clean kobo output

# Get data:
kobo_clean<-read.csv(file="kobo_output_clean.csv", header=TRUE)

# Extract indicator 1 data from kobo output, show most relevant columns
ind1_data<-get_indicator1_data(kobo_output=kobo_clean)
## [1] "the data already contained a taxon column, that was used instead of creating a new one"
head(ind1_data[,c(1:3, 12:14)])
# Extract indicator 2 data from kobo output, show most relevant columns
ind2_data<-get_indicator2_data(kobo_output=kobo_clean)
## [1] "the data already contained a taxon column, that was used instead of creating a new one"
head(ind2_data[,c(1:3, 9:10,13)])
# Extract indicator 3 data from kobo output, show most relevant columns
ind3_data<-get_indicator3_data(kobo_output=kobo_clean)
## [1] "the data already contained a taxon column, that was used instead of creating a new one"
head(ind3_data[,c(1:3, 9:11)])
# extract metadata, show most relevant columns
metadata<-get_metadata(kobo_output=kobo_clean)
## [1] "the data already contained a taxon column, that was used instead of creating a new one"
head(metadata[,c(1:3, 12, 25,26, 64)])
# save processed data
write.csv(ind1_data, "ind1_data.csv", row.names = FALSE, fileEncoding = "UTF-8")
write.csv(ind2_data, "ind2_data.csv", row.names = FALSE, fileEncoding = "UTF-8")
write.csv(ind3_data, "ind3_data.csv", row.names = FALSE, fileEncoding = "UTF-8")
write.csv(metadata, "metadata.csv", row.names = FALSE, fileEncoding = "UTF-8")

General description of the dataset

Methods to define populations

The methods used to define populations come from a check box question were one or more of the following categories can be selected: genetic_clusters, geographic_boundaries, eco_biogeo_proxies, adaptive_traits, management_units, other. As a consequence any combination of the former can be possible. Leading to the following results:

table(ind2_data$defined_populations)
## 
##                                                                            adaptive_traits 
##                                                                                          5 
##                                                           adaptive_traits management_units 
##                                                                                         20 
##                                                                         eco_biogeo_proxies 
##                                                                                         41 
##                                                         eco_biogeo_proxies adaptive_traits 
##                                                                                          2 
##                                                        eco_biogeo_proxies management_units 
##                                                                                         10 
##                                                                   eco_biogeo_proxies other 
##                                                                                          2 
##                                                                           genetic_clusters 
##                                                                                        107 
##                                                           genetic_clusters adaptive_traits 
##                                                                                          7 
##                                                        genetic_clusters eco_biogeo_proxies 
##                                                                                         20 
##                                        genetic_clusters eco_biogeo_proxies adaptive_traits 
##                                                                                          3 
##                       genetic_clusters eco_biogeo_proxies adaptive_traits management_units 
##                                                                                          2 
##                                       genetic_clusters eco_biogeo_proxies management_units 
##                                                                                          1 
##                                                     genetic_clusters geographic_boundaries 
##                                                                                         74 
##                                     genetic_clusters geographic_boundaries adaptive_traits 
##                                                                                          5 
##                                  genetic_clusters geographic_boundaries eco_biogeo_proxies 
##                                                                                          7 
##                  genetic_clusters geographic_boundaries eco_biogeo_proxies adaptive_traits 
##                                                                                          1 
## genetic_clusters geographic_boundaries eco_biogeo_proxies adaptive_traits management_units 
##                                                                                          1 
##                 genetic_clusters geographic_boundaries eco_biogeo_proxies management_units 
##                                                                                          1 
##                                    genetic_clusters geographic_boundaries management_units 
##                                                                                          8 
##                                                          genetic_clusters management_units 
##                                                                                         11 
##                                                                     genetic_clusters other 
##                                                                                          2 
##                                                                      geographic_boundaries 
##                                                                                        276 
##                                                      geographic_boundaries adaptive_traits 
##                                                                                         32 
##                                     geographic_boundaries adaptive_traits management_units 
##                                                                                         12 
##                               geographic_boundaries adaptive_traits management_units other 
##                                                                                          1 
##                                                   geographic_boundaries eco_biogeo_proxies 
##                                                                                         75 
##                                   geographic_boundaries eco_biogeo_proxies adaptive_traits 
##                                                                                          3 
##                                  geographic_boundaries eco_biogeo_proxies management_units 
##                                                                                          3 
##                                             geographic_boundaries eco_biogeo_proxies other 
##                                                                                          2 
##                                                     geographic_boundaries management_units 
##                                                                                         25 
##                                                                geographic_boundaries other 
##                                                                                         12 
##                                                                           management_units 
##                                                                                        132 
##                                                                     management_units other 
##                                                                                          2 
##                                                                                      other 
##                                                                                         19

It is hard to group the above methods, so we will keep the original groups with n >=19 in the above list, and tag the combinations that appear few times as as “low_freq_combinations”.

Which groups have n>=19?

x<-as.data.frame(table(ind2_data$defined_populations)[table(ind2_data$defined_populations) >= 19])
colnames(x)[1]<-"method"

x
## (CONSIDER ADDING THIS CHUNK TO THE GET_ FUNCTIONS?)
### for ind2_data
ind2_data<- ind2_data %>% 
  mutate(defined_populations_simplified = case_when(
         # if the method is in the list of methods n>=19 then keep it
         defined_populations %in% x$method ~ defined_populations,
         TRUE ~ "low_freq_combinations"))


### for meta
metadata<- metadata %>% 
  mutate(defined_populations_simplified = case_when(
         # if the method is in the list of methods n>=19 then keep it
         defined_populations %in% x$method ~ defined_populations,
         TRUE ~ "low_freq_combinations"))

Check n for simplified methods:

table(ind2_data$defined_populations_simplified)
## 
##         adaptive_traits management_units 
##                                       20 
##                       eco_biogeo_proxies 
##                                       41 
##                         genetic_clusters 
##                                      107 
##      genetic_clusters eco_biogeo_proxies 
##                                       20 
##   genetic_clusters geographic_boundaries 
##                                       74 
##                    geographic_boundaries 
##                                      276 
##    geographic_boundaries adaptive_traits 
##                                       32 
## geographic_boundaries eco_biogeo_proxies 
##                                       75 
##   geographic_boundaries management_units 
##                                       25 
##                    low_freq_combinations 
##                                      103 
##                         management_units 
##                                      132 
##                                    other 
##                                       19

Another option is to highlight if genetic_cluster or geographic_boundaries were used at all, which are the main drivers. This will look like:

### for ind2_data
#ind2_data<- ind2_data %>% 
 # mutate(defined_populations_geneticgeo = case_when(
      #### PENDIENTE USE grepl to show TRUE for geo, gen, geo AND gen, others.
    #))

Table of equivalences:

ind2_data %>% 
       select(defined_populations, defined_populations_simplified) %>% 
       filter(!duplicated(defined_populations))

Total number of taxa and taxa assessed more than once.

Records by country, including taxa assessed more than once (see below for details on this)

ggplot(metadata, aes(x=country_assessment)) + 
  geom_bar(stat = "count") +
  ggtitle("Number of taxa assessedd by country, \nincluding duplicated taxa")

Did countries used kobo or tabular?

ggplot(metadata, aes(x=country_assessment, fill=kobo_tabular)) + 
  geom_bar(stat = "count") +
  ggtitle("Number of taxa assessedd by country, \nincluding duplicated taxa")

Records by taxonomic groups

ggplot(metadata, aes(x=taxonomic_group)) + 
  geom_bar(stat = "count") +
  theme(axis.text.x = element_text(angle = 45)) +
  ggtitle("Number of taxa assessedd by taxonomic group, \nincluding duplicated taxa")

Some taxa were assessed twice, for example to account for uncertainty on how to divide populations. This information is stored in variable multiassessment of the metadata (created by get_metadata()). An example of taxa with multiple assessments:

metadata %>%
filter(multiassessment=="multiassessment")  %>%
  select(taxonomic_group, taxon, country_assessment, multiassessment) %>%
  arrange(taxon, country_assessment) %>%
  head()

In total these are the number or records (assessment) done for both categories:

table(metadata$multiassessment)
## 
##   multiassessment single_assessment 
##                93               831

The above numbers refer to the number or records, if what we want is to know how many taxa were analysed for each category, then:

Number of taxa with multiple submissions:

multi_taxa<- metadata %>%
                filter(multiassessment=="multiassessment") %>%
                select(taxon, country_assessment, taxonomic_group) %>%
                unique()
# how many?
nrow(multi_taxa)
## [1] 46

Number of taxa with single submissions:

single_taxa<- metadata %>%
                filter(multiassessment=="single_assessment") %>%
                select(taxon, country_assessment, taxonomic_group) %>%
                unique()
# how many?
nrow(single_taxa)
## [1] 831

To explore what kind of taxa countries assessed regardless of if they assessed them once or more, lets create a dataset keeping all single assessed taxa, plus only the first assessment for taxa assessed multiple times.

# object with single assessed taxa, plus only the first assessment for taxa assessed multiple times
metadata_firstmulti<-metadata[!duplicated(cbind(metadata$taxon, metadata$country_assessment)), ]

How many records?

# how many?
nrow(metadata_firstmulti)
## [1] 877

Of which countries and taxonomic groups are the taxa that were assessed more than once?

metadata_firstmulti %>% # we use the _firstmulti dataset so that multiassesed records are counted only once
        filter(multiassessment=="multiassessment") %>%
      ggplot(aes(x=country_assessment)) + 
        geom_bar(stat = "count") +
        ggtitle("Number of taxa assessed more than once, by country")

metadata_firstmulti %>% # we use the _unique dataset so that multiassesed records are counted only once
        filter(multiassessment=="multiassessment") %>%

ggplot(aes(x=taxonomic_group, fill=country_assessment)) + 
  geom_bar(stat = "count") +
  theme(axis.text.x = element_text(angle = 45)) +
  ggtitle("Number of taxa assessed more than once")

Now check taxa assessed excluding duplicates, i.e. the real number of taxa assessed. This will be used in downstream analyses

ggplot(metadata_firstmulti, aes(x=country_assessment)) + 
  geom_bar(stat = "count") +
  ggtitle("Number of taxa assessed by country")

ggplot(metadata_firstmulti, aes(x=taxonomic_group)) + 
  geom_bar(stat = "count") +
  theme(axis.text.x = element_text(angle = 45)) +
  ggtitle("Number of taxa assessed by taxonomic group")

Sankey and alluvial fun

Note: The following plots in this section consider only one record of the taxa that were assessed more than once. That is a total of 877 taxa.

Which taxonomic groups are countries assessing?

Note on alluvial vs Sankey, taken from ggalluvial: An important feature of alluvial plots is the meaningfulness of the vertical axis: No gaps are inserted between the strata, so the total height of the plot reflects the cumulative quantity of the observations. The plots produced by {ggalluvial} conform to the “grammar of graphics” principles of {ggplot2}, and this prevents users from producing “free-floating” visualizations like the Sankey diagrams

Using alluvial:

library(alluvial)
# estimate frequencies for alluvial plot
foralluvial_1<-metadata_firstmulti %>% group_by(country_assessment, taxonomic_group) %>%
             summarise(n=n())
head(foralluvial_1)
## plot
# define colors
my_cols<- gg_color_hue(length(unique(foralluvial_1$country_assessment)))

# we need a vector of colors by country for each row of the dataset, so:
countries<-as.factor(foralluvial_1$country_assessment)
levels(countries)<-my_cols
countries<-as.vector(countries)
head(countries)
## [1] "#F8766D" "#F8766D" "#F8766D" "#F8766D" "#F8766D" "#F8766D"
# plot
alluvial(foralluvial_1[,1:2], freq = foralluvial_1$n,
         col=countries, 
         blocks=FALSE,
         gap.width = 0.3,
         cex=.8, 
         xw = 0.2,
         cw = 0.1,
         border = NA)

Using ggsankey

library(ggsankey)

# transform data to how ggsankey wants it
df <- metadata_firstmulti %>%
  make_long(country_assessment, taxonomic_group)

# Sankey plot
ggplot(df, aes(x = x,
               next_x = next_x,
               node = node,
               next_node = next_node,
               fill = factor(node),
               label = node)) +
  geom_sankey(flow.alpha = 0.6, 
              node.color = NA,
              show.legend = FALSE) +
  geom_sankey_label(size = 2, color = 1, fill = "white") +
  # colour by country
  scale_fill_manual(values=c(scales::hue_pal()(length(unique(metadata$country_assessment))),  # a nice  color for each of the n countries
                             rep("darkgrey", length(unique(metadata$taxonomic_group)))), # grey for the n taxonomic groups
                    breaks=c(unique(metadata$country_assessment), 
                             unique(metadata$taxonomic_group))) +
  theme_sankey(base_size = 10) +
  xlab("")

Ne / Nc data across countries and taxa?

Using alluvial:

library(alluvial)
# estimate frequencies for alluvial plot
foralluvial_1<-metadata_firstmulti %>% group_by(country_assessment, taxonomic_group, popsize_data) %>%
             summarise(n=n())
head(foralluvial_1)
## plot
# define colors
my_cols<- gg_color_hue(length(unique(foralluvial_1$country_assessment)))

# we need a vector of colors by country for each row of the dataset, so:
countries<-as.factor(foralluvial_1$country_assessment)
levels(countries)<-my_cols
countries<-as.vector(countries)
head(countries)
## [1] "#F8766D" "#F8766D" "#F8766D" "#F8766D" "#F8766D" "#F8766D"
# plot
alluvial(foralluvial_1[,1:3], freq = foralluvial_1$n,
         col=countries, 
         blocks=FALSE,
         gap.width = 0.5,
         cex=.8, 
         xw = 0.1,
         cw = 0.2,
         border = NA)

Using ggsankey option 1

# transform data to how ggsankey wants it
df <- metadata_firstmulti %>%
  make_long(country_assessment, taxonomic_group, popsize_data)

# Sankey
ggplot(df, aes(x = x,
               next_x = next_x,
               node = node,
               next_node = next_node,
               fill = factor(node),
               label = node)) +
  geom_sankey(flow.alpha = 0.6, 
              show.legend = FALSE) +
  # manually set flow fill according to countries and popsize 
                      # a nice  color for each of the n countries for the first (left) part
  scale_fill_manual(values=c(scales::hue_pal()(length(unique(metadata_firstmulti$country_assessment))),  
                             # grey for the taxonomic groups
                              rep("darkgrey", length(unique(metadata_firstmulti$taxonomic_group))),
                             # three colors for popsize_data
                             c("darkolivegreen", "darkgoldenrod1", "brown3")), 
                    breaks=c(unique(metadata_firstmulti$country_assessment), 
                             unique(metadata_firstmulti$taxonomic_group),
                             unique(metadata_firstmulti$popsize_data))) +

  geom_sankey_label(size = 2, color = 1, fill = "white") +
  theme_sankey(base_size = 10) +
  xlab("")

Using ggsankey option 2

# transform data to how ggsankey wants it
df <- metadata_firstmulti %>%
  make_long(country_assessment, taxonomic_group, popsize_data)

# Sankey
ggplot(df, aes(x = x,
               next_x = next_x,
               node = node,
               next_node = next_node,
               fill = factor(node),
               label = node)) +
  geom_sankey(flow.alpha = 0.6, 
              show.legend = FALSE) +
  # manually set flow fill according to countries and popsize 
                      # a nice  color for each of the n countries for the first (left) part
  scale_fill_manual(values=c(scales::hue_pal()(length(unique(metadata_firstmulti$country_assessment))),  
                             # gradient of soft colors for taxonomic groups
                             taxoncolors,
                             # traffic light for pop data
                             c("darkolivegreen", "darkgoldenrod1", "brown3")), 
                    breaks=c(unique(metadata_firstmulti$country_assessment), 
                             levels(as.factor(metadata_firstmulti$taxonomic_group)),
                             unique(metadata_firstmulti$popsize_data))) +

  geom_sankey_label(size = 2.5, color = "black", fill = "white") +
  theme_sankey(base_size = 10) +
  xlab("")

Bar plots

ggplot(metadata, aes(x=taxonomic_group, fill=popsize_data)) + 
  geom_bar(stat = "count") +
  coord_flip() +
  facet_wrap(~country_assessment, ncol = 4) +
  scale_fill_manual(values=c("#1f77b4", "grey80", "#2ca02c"),
                    breaks=c(levels(as.factor(metadata$popsize_data)))) +
  theme_light()

Taxonomic groups and IUCN status

Using alluvial: (pending to fix a NA in Cucurbita argyrosperma)

library(alluvial)
# estimate frequencies for alluvial plot
foralluvial_1<-metadata_firstmulti %>% group_by(taxonomic_group, global_IUCN, popsize_data) %>%
             summarise(n=n())

## plot
# define colors according to IUCN categories
# check order of categories:
levels(as.factor(metadata_firstmulti$global_IUCN))

# create vector of colors accordingly:
my_cols<-IUCNcolors


# we need a vector of colors by iucn for each row of the dataset, so:
gIUCN<-as.factor(foralluvial_1$global_IUCN)
levels(gIUCN)<-my_cols
gIUCN<-as.vector(gIUCN)
head(gIUCN)

# plot
alluvial(foralluvial_1[,1:3], freq = foralluvial_1$n,
         col=gIUCN, 
         blocks="bookends",
         alpha = 0.6,
         gap.width = 0.4,
         cex=.75, 
         xw = 0.15,
         cw = 0.2,
         border = NA)

Using ggsankey

# transform data to how ggsankey wants it
df <- metadata %>%
  make_long(taxonomic_group, global_IUCN, popsize_data)


# Sankey
ggplot(df, aes(x = x,
               next_x = next_x,
               node = node,
               next_node = next_node,
               fill = factor(node),
               label = node)) +
  geom_sankey(flow.alpha = 0.6, 
              node.color = NA,
              show.legend = FALSE) +
  
    # manually set flow fill 
                       # gradient of soft colors for taxonomic groups
  scale_fill_manual(values= c(taxoncolors,
                       # iucn color codes
                           IUCNcolors,
                     # traffic light for pop data
                           c("darkolivegreen", "darkgoldenrod1", "brown3")),
                    breaks=c(levels(as.factor(metadata_firstmulti$taxonomic_group)), 
                             levels(as.factor(metadata_firstmulti$global_IUCN)),
                             unique(metadata_firstmulti$popsize_data))) +

  geom_sankey_label(size = 2, color = 1, fill = "white") +
  theme_sankey(base_size = 10) +
  xlab("")

Method to define populations

The following plots consider the whole dataset, ie including taxa that were assessed more than once (because they could have been analysed using different methods to define populations)

# reformat data
foralluvial<-metadata %>% group_by(country_assessment, defined_populations_simplified, taxonomic_group) %>%
             summarise(n=n())

## plot
# define colors
my_cols<- simplifiedmethods_colors

# we need a vector of colors by country for each row of the dataset, so:
methodspop<-as.factor(foralluvial$defined_populations_simplified)
levels(methodspop)<-my_cols
methodspop<-as.vector(methodspop)
head(methodspop)
## [1] "#668cd1" "#668cd1" "#668cd1" "#668cd1" "#668cd1" "#45c097"
# plot
alluvial(foralluvial[,1:3], freq = foralluvial$n,
         col=methodspop, 
         blocks=FALSE,
         gap.width = 0.5,
         cex=.8, 
         xw = 0.1,
         cw = 0.2,
         border = NA,
         alpha = .7)

Same only country and method:

# plot
alluvial(foralluvial[,1:2], freq = foralluvial$n,
         col=methodspop, 
         blocks=FALSE,
         gap.width = 0.5,
         cex=.8, 
         xw = 0.1,
         cw = 0.2,
         border = NA,
          alpha = .7)

Sankey just becasue why not:

# transform data to how ggsankey wants it
df <- metadata %>%
  make_long(country_assessment, defined_populations_simplified, taxonomic_group)

# Sankey plot
ggplot(df, aes(x = x,
               next_x = next_x,
               node = node,
               next_node = next_node,
               fill = factor(node),
               label = node)) +
  geom_sankey(flow.alpha = 0.6, 
              node.color = NA,
              show.legend = FALSE) +
  geom_sankey_label(size = 2, color = 1, fill = "white") +
  # colour by country
                              # a nice  color for each of the n countries
  scale_fill_manual(values=c(scales::hue_pal()(length(unique(metadata$country_assessment))),  
                             rep("darkgrey", length(unique(metadata$defined_populations_simplified))),
                             taxoncolors), 
                    breaks=c(unique(metadata$country_assessment), 
                             levels(as.factor(ind2_data$defined_populations_simplified)),
                             levels(as.factor(ind2_data$taxonomic_group)))) +
  theme_sankey(base_size = 10) +
  xlab("")

Indicator 1 (populations Ne > 500)

Remember population size data could be obtained by different means

Population size data may come from different methods for each population within a single taxon. For example, some populations can have Ne estimates, other Nc and others a range. Examples:

ind1_data %>%
  filter(taxon=="Alces alces") %>% 
  select(country_assessment, taxon, population, Name, Ne, NcRange)
ind1_data %>%
  filter(taxon=="Alces alces") %>% 
  select(country_assessment, taxon, population, Name, NcPoint, NcRange)

Also, for some taxa there may be population size data for some populations, but not all. Therefore indicator 1 would be computed with less populations than the total number of populations. Example (see pop3, 4, 13, 15):

ind1_data %>%
  filter(taxon=="Juniperus monticola") %>% 
  select(country_assessment, taxon, population, Name, Ne, NcPoint, NcRange)

We need to keep the former in mind for interpretation and discussion of how the indicator can change in future assessments if data becomes available for populations currently missing.

How many of the 2549 populations have Ne, Nc or range data?

Ne?

sum(!is.na(ind1_data$Ne))
## [1] 174

Nc point?

sum(!is.na(ind1_data$NcPoint))
## [1] 636

Nc range?

sum(!is.na(ind1_data$NcRange))
## [1] 1460

Ne data by population

ind1_data %>%
  ggplot(aes(x=country_assessment, fill=is.na(Ne)))+
  geom_bar(position = "dodge") +
  coord_flip() + 
  scale_fill_manual(labels=c("no", "yes"),
                      breaks=c("TRUE", "FALSE"),
                      values=c("brown", "darkgreen")) + 
  labs(fill="Population has Ne data?") +
  xlab("") +
  theme_bw() +
  theme(text = element_text(size = 17), legend.position = "bottom", panel.border = element_blank())

Nc data type by population

ind1_data %>%
  filter(!is.na(NcType)) %>%
  ggplot(aes(x=country_assessment, fill=NcType))+
  geom_bar(position = "dodge") +
  coord_flip() + 
  scale_fill_discrete(labels=c("Nc_point", "Nc_range", "Unknown Nc for this population"),
                     breaks=c("Nc_point", "Nc_range", "unknown")) +
  xlab("") +
  theme_bw() +
  theme(text = element_text(size = 17), legend.position = "bottom", panel.border = element_blank()) +
  labs(fill="") +
  ggtitle("Type of Nc data by population")

By taxa

Nc data availalbe by taxa?

metadata %>%
  filter(!is.na(nc_pops_exists)) %>%
  ggplot(aes(x=country_assessment, fill=nc_pops_exists))+
  geom_bar() +
  coord_flip() + 
  scale_fill_manual(values=c("brown", "darkgreen")) + 
  labs(fill="Has Nc data for indicator 1?") +
  xlab("") +
  ggtitle("Has Nc data?") +
  theme_bw() +
  theme(text = element_text(size = 17), legend.position = "bottom", panel.border = element_blank())

Ne data and genetic data available?

metadata %>% 
  filter(!is.na(ne_pops_exists)) %>% 
    ggplot(aes(x=country_assessment, fill=ne_pops_exists)) + 
  geom_bar(position = "dodge") +
scale_fill_manual(labels=c("no", "yes", "no, but has genetic data"),
                      breaks=c("no_genetic_data", "ne_available", "other_genetic_info"),
                      values=c("brown", "darkgreen", "grey")) +
  coord_flip()  +
theme_bw() +
theme(text = element_text(size = 17), legend.position = "bottom", panel.border = element_blank()) +
ggtitle("Has Ne data?")

Ne data yes or not?

metadata %>% 
  filter(!is.na(ne_pops_exists)) %>% 
  filter(ne_pops_exists!="other_genetic_info") %>%
    ggplot(aes(x=country_assessment, fill=ne_pops_exists)) + 
  geom_bar() +
scale_fill_manual(labels=c("no", "yes"),
                      breaks=c("no_genetic_data", "ne_available"),
                      values=c("brown", "darkgreen")) +
  coord_flip()  +
xlab("") +
labs(fill="Has Ne data for indicator 1?")  +
theme_bw() +
theme(text = element_text(size = 17), legend.position = "bottom", panel.border = element_blank()) +
ggtitle("Has Ne data?")

Ne values

How is Ne data distributed?

summary(ind1_data$Ne)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##      1.9     41.0    150.0   8782.2    639.0 500000.0     2375

Boxplot of Ne values:

ind1_data %>%
  ggplot(aes(x=country_assessment, y=Ne)) +
  geom_boxplot()

Check outliers (Ne very high):

ind1_data %>% 
  filter(Ne > 100000) %>%
  select(country_assessment, name_assessor, taxon, taxonomic_group, Ne, NeLower, NeUpper, multiassessment, population)

Boxplot filtering outliers (Ne)

ind1_data %>% filter(Ne < 100000) %>%
  ggplot(aes(x=country_assessment, y=Ne)) +
  geom_boxplot()

Estimate indicator 1

Get function to transform NcRange and NcPoint data into Ne.

# check what the custom funciton does
transform_to_Ne
## function (ind1_data) 
## {
##     ind1_data = ind1_data
##     ind1_data <- ind1_data %>% mutate(Nc_from_range = case_when(NcRange == 
##         "more_5000_bymuch" ~ 5001, NcRange == "more_5000" ~ 5001, 
##         NcRange == "less_5000_bymuch" ~ 100, NcRange == "less_5000" ~ 
##             100, NcRange == "range_includes_5000" ~ 5001)) %>% 
##         mutate(Ne_from_Nc = case_when(!is.na(NcPoint) ~ NcPoint * 
##             0.1, !is.na(Nc_from_range) ~ Nc_from_range * 0.1)) %>% 
##         mutate(Ne_combined = if_else(is.na(Ne), Ne_from_Nc, Ne))
##     print(ind1_data)
## }

Use function to get Ne data from NcRange or NcPoint data, and their combination (Ne estimated from Ne if Ne is available, otherwise, from Nc)

ind1_data<-transform_to_Ne(ind1_data = ind1_data) 
## # A tibble: 2,549 × 38
##    country_assessme… taxonomic_group taxon scientific_auth… genus year_assesment
##    <chr>             <chr>           <chr> <chr>            <chr> <chr>         
##  1 sweden            mammal          Alce… (Linnaeus, 1758) Alces 2023          
##  2 sweden            mammal          Alce… (Linnaeus, 1758) Alces 2023          
##  3 sweden            mammal          Alce… (Linnaeus, 1758) Alces 2023          
##  4 sweden            fish            Silu… (Linnaeus, 1758) Silu… 2023          
##  5 sweden            fish            Silu… (Linnaeus, 1758) Silu… 2023          
##  6 sweden            fish            Silu… (Linnaeus, 1758) Silu… 2023          
##  7 sweden            fish            Silu… (Linnaeus, 1758) Silu… 2023          
##  8 sweden            fish            Silu… (Linnaeus, 1758) Silu… 2023          
##  9 sweden            fish            Silu… (Linnaeus, 1758) Silu… 2023          
## 10 sweden            bird            Dend… Bechstein 1803   Dend… 2022          
## # … with 2,539 more rows, and 32 more variables: name_assessor <chr>,
## #   email_assessor <chr>, kobo_tabular <chr>, time_populations <chr>,
## #   X_validation_status <chr>, X_uuid <chr>, multiassessment <chr>,
## #   population <chr>, Name <chr>, Origin <chr>, IntroductionYear <chr>,
## #   Ne <dbl>, NeLower <dbl>, NeUpper <dbl>, NeYear <chr>, GeneticMarkers <chr>,
## #   GeneticMarkersOther <chr>, MethodNe <chr>, SourceNe <chr>, NcType <chr>,
## #   NcYear <chr>, NcMethod <chr>, NcRange <chr>, NcRangeDetails <chr>, …
ind1_data %>%
  select(taxon, population, Ne, NcPoint, NcRange, Ne_from_Nc, Ne_combined)

Get function to estimate indicator 1

# check what the custom funciton does
estimate_indicator1
## function (ind1_data) 
## {
##     indicator1 <- ind1_data %>% group_by(X_uuid, ) %>% summarise(n_pops = n(), 
##         n_pops_Ne_data = sum(!is.na(Ne_combined)), n_pops_more_500 = sum(Ne_combined > 
##             500, na.rm = TRUE), indicator1 = n_pops_more_500/n_pops_Ne_data) %>% 
##         left_join(metadata)
##     print(indicator1)
## }

Now estimate indicator 1 :)

indicator1<-estimate_indicator1(ind1_data = ind1_data)
## # A tibble: 547 × 70
##    X_uuid      n_pops n_pops_Ne_data n_pops_more_500 indicator1 country_assessm…
##    <chr>        <int>          <int>           <int>      <dbl> <chr>           
##  1 010d85cd-5…      2              1               1          1 united_states   
##  2 016d59ae-9…      1              1               0          0 mexico          
##  3 019bd95f-b…      1              1               0          0 sweden          
##  4 01b10b29-9…      1              1               1          1 south_africa    
##  5 0301e6b3-b…      3              3               3          1 france          
##  6 037d6c8f-7…      4              2               2          1 united_states   
##  7 03f03179-1…      1              1               1          1 south_africa    
##  8 0586b61e-7…     12             12               0          0 belgium         
##  9 065a53ba-0…      1              1               0          0 south_africa    
## 10 06e6bb50-3…      1              1               0          0 belgium         
## # … with 537 more rows, and 64 more variables: taxonomic_group <chr>,
## #   taxon <chr>, scientific_authority <chr>, genus <chr>, year_assesment <chr>,
## #   name_assessor <chr>, email_assessor <chr>, common_name <chr>,
## #   kobo_tabular <chr>, X_validation_status <chr>, GBIF_taxonID <int>,
## #   NCBI_taxonID <chr>, national_taxonID <chr>, source_national_taxonID <chr>,
## #   other_populations <chr>, time_populations <chr>, defined_populations <chr>,
## #   source_definition_populations <chr>, map_populations <chr>, …
indicator1

Plots indicator 1

By country

# get sample size by desired category

sample_size <- indicator1 %>%
                    group_by(country_assessment) %>% summarize(num=n())

# plot
indicator1 %>% 
  # add sampling size 
  left_join(sample_size) %>%
  mutate(myaxis = paste0(country_assessment, " (n= ", num, ")")) %>%

  # plot
  ggplot(aes(x=myaxis, y=indicator1, fill=country_assessment)) +
  geom_violin(width= 1, linewidth = 0)  +
  geom_jitter(size=.5, width = 0.1) +
  xlab("") +
  ylab("indicator 1 (proportion of populations Ne > 500)") +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank(), legend.position="none", text = element_text(size = 20))

By taxonomic group

# get sample size by desired category

sample_size <- indicator1 %>%
                    group_by(taxonomic_group) %>% summarize(num=n())

# plot
indicator1 %>%
  # add sampling size
  left_join(sample_size) %>%
  mutate(myaxis = paste0(taxonomic_group, " (n= ", num, ")")) %>%

  #plot
ggplot(aes(x=myaxis, y=indicator1, fill=taxonomic_group), color=NA) +
      geom_violin(width=1, linewidth = 0)  +
      geom_jitter(size=.5, width = 0.1, color="brown") +
      xlab("") +
      coord_flip() +
      scale_fill_manual(values= taxoncolors,
                    breaks=c(levels(as.factor(indicator1$taxonomic_group)))) +
      theme_bw() +
      theme(panel.border = element_blank(), legend.position="none", text = element_text(size = 15))

By country and taxonomic group

indicator1 %>%
ggplot(aes(x=taxonomic_group, y=indicator1, fill=taxonomic_group), color=NA) +
      geom_violin(width=1, linewidth = 0)  +
      geom_jitter(size=.5, width = 0.1, color="brown") +
      xlab("") +
      ylab("Indicator 1 (proportion of populations Ne > 500") +
      coord_flip() +
      scale_fill_manual(values= taxoncolors,
                    breaks=c(levels(as.factor(indicator1 $taxonomic_group)))) +
      theme_bw() +
      theme(panel.border = element_blank(), legend.position="none", text = element_text(size = 14)) +
      facet_wrap(~country_assessment, ncol = 5) +
       theme(panel.spacing = unit(1.5, "lines"))

By IUCN

 # get sample size by desired category

sample_size <- indicator1 %>%
                    group_by(global_IUCN) %>% summarize(num=n())

# plot
indicator1 %>% 
  # add sampling size 
  left_join(sample_size) %>%
  mutate(myaxis = paste0(global_IUCN, " (n= ", num, ")")) %>%

  # plot
  ggplot(aes(x=myaxis, y=indicator1, fill=global_IUCN)) +
      geom_violin(width=.7, linewidth = 0)  +
      geom_jitter(size=.5, width = 0.1) +
      xlab("") +
      coord_flip() +
      scale_fill_manual(values= IUCNcolors, # iucn color codes
                    breaks=c(levels(as.factor(indicator1$global_IUCN)))) +
      theme_bw() +
      theme(panel.border = element_blank(), legend.position="none", text= element_text(size=15)) +
      theme(panel.spacing = unit(1.5, "lines"))

By IUCN and country

# get sample size by desired category
# plot
indicator1 %>% 
  # plot
  ggplot(aes(x=global_IUCN, y=indicator1, fill=global_IUCN)) +
      geom_violin(width=.7, linewidth = 0)  +
      geom_jitter(size=.5, width = 0.1) +
      xlab("") +
      coord_flip() +
      scale_fill_manual(values= IUCNcolors, # iucn color codes
                    breaks=c(levels(as.factor(indicator1$global_IUCN)))) +
      theme_bw() +
      theme(panel.border = element_blank(), legend.position="none", text= element_text(size=15)) +
      facet_wrap(~country_assessment, ncol = 5) +
      theme(panel.spacing = unit(1.5, "lines"))

By range

# get sample size by desired category
sample_size <- indicator1  %>%
                    filter(!is.na(indicator1)) %>% 
                    group_by(species_range) %>% summarize(num=n())

# plot
indicator1  %>% 
  # add sampling size 
  left_join(sample_size) %>%
  mutate(myaxis = paste0(species_range, " (n= ", num, ")")) %>%

  # plot
  ggplot(aes(x=myaxis, y=indicator1 , fill=species_range)) +
      geom_violin(width=1, linewidth = 0)  +
      geom_jitter(size=.5, width = 0.1) +
      xlab("") +
      coord_flip() +
      theme_bw() +
      theme(panel.border = element_blank(), legend.position="none", text= element_text(size=20))

By range and country

# plot
indicator1  %>% 
  # add sampling size 
  left_join(sample_size) %>%
  mutate(myaxis = paste0(species_range, " (n= ", num, ")")) %>%

  # plot
  ggplot(aes(x=species_range, y=indicator1 , fill=species_range)) +
      geom_violin(width=1, linewidth = 0)  +
      geom_jitter(size=.5, width = 0.1) +
      xlab("") +
      coord_flip() +
      theme_bw() +
      theme(panel.border = element_blank(), legend.position="none", text= element_text(size=20)) +
      facet_wrap(~country_assessment, ncol = 5) +
      theme(panel.spacing = unit(1.5, "lines"))

Indicator 2 (proportion of populations within species which are maintained)

Indicator 2 is the he proportion of populations within species which are maintained. This can be estimated based on the n_extant_populations and n_extint_populations, as follows:

ind2_data$indicator2<- ind2_data$n_extant_populations / (ind2_data$n_extant_populations + ind2_data$n_extint_populations)
head(ind2_data$indicator2)
## [1] 1.0000000 0.5000000 0.2941176 1.0000000 0.3333333 1.0000000

Number of extant populations.

See the distribution of the number of extant populations:

ind2_data %>% 
  ggplot(aes(x=n_extant_populations)) +
  geom_histogram() + 
  ggtitle("histogram of extant populations")

Which taxa have more than 100 populations?

ind2_data %>% 
  filter(n_extant_populations>100) %>%
  select(taxon, country_assessment, n_extant_populations)

Exclude outliers (>200 populations)

ind2_data %>% 
  filter(n_extant_populations<200) %>%
  ggplot(aes(x=n_extant_populations)) +
  geom_histogram() + 
  ggtitle("Distribution of extant populations \nexcluding outliers (>200 pops)")

How does the number of populations vary by country? (excluding outliers: >200 pops)

ind2_data %>% 
  filter(n_extant_populations<200) %>%
  ggplot(aes(x=country_assessment, y=n_extant_populations)) +
          geom_boxplot(aes(color=country_assessment)) +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank(), legend.position="none") + 
  ggtitle("Distribution of extant populations by country, \nexcluding outliers (>200 pops)")

And by method to define populations? (excluding outliers: >200 pops)

ind2_data %>% 
  filter(n_extant_populations<200) %>%
  ggplot(aes(x=defined_populations, y=n_extant_populations)) +
          geom_boxplot() +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank()) + 
  ggtitle("Distribution of extant populations by pop definition method, \nexcluding outliers (>200 pops)")

Simplified method categories for easier visualization:

ind2_data %>% 
  filter(n_extant_populations<200) %>%
  ggplot(aes(x=defined_populations_simplified, y=n_extant_populations, color=defined_populations_simplified)) +
          geom_boxplot() +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank(), legend.position="none") + 
  ggtitle("Distribution of extant populations by pop definition method, \nexcluding outliers (>200 pops)") +
  scale_color_manual(values=simplifiedmethods_colors,
                    breaks=levels(as.factor(ind2_data$defined_populations_simplified)))

Number of populations by taxonomic group:

ind2_data %>% 
  filter(n_extant_populations<200) %>%
  ggplot(aes(x=taxonomic_group, y=n_extant_populations, color=taxonomic_group)) +
          geom_boxplot() +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank()) + 
  ggtitle("Distribution of extant populations by taxonomic group, \nexcluding outliers (>200 pops)") +
  scale_color_manual(values= taxoncolors,
                    breaks=levels(levels(as.factor(ind2_data$taxonomic_group))))

Taxonomic group and method:

ind2_data %>% 
  filter(n_extant_populations<200) %>%
  ggplot(aes(x=taxonomic_group, y=n_extant_populations, color=taxonomic_group)) +
          geom_boxplot() +
          geom_jitter(size=.3, width = 0.1, color="darkred") +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank()) + 
  ggtitle("Distribution of extant populations by taxonomic group and method to define pops, excluding outliers (>200 pops)") +
  scale_color_manual(values= taxoncolors,
                    breaks=levels(levels(as.factor(ind2_data$taxonomic_group)))) +
  facet_wrap(defined_populations_simplified ~ .) +
  xlab("")

Country and method:

ind2_data %>% 
  filter(n_extant_populations<200) %>%
  ggplot(aes(x=defined_populations_simplified, y=n_extant_populations, color=defined_populations_simplified)) +
          geom_boxplot() +
          geom_jitter(size=.3, width = 0.1, color="black") +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank(), legend.position="none") + 
  ggtitle("Distribution of extant populations by country and \nmethod to define pops, excluding outliers (>200 pops)") +
  facet_wrap(country_assessment ~ ., nrow=2) +
  xlab("")  +
  scale_color_manual(values=simplifiedmethods_colors,
                    breaks=levels(as.factor(ind2_data$defined_populations_simplified)))

Country and method, but with the US and Sweden in different scale

## excluding Sweden and US
p1<-ind2_data %>% 
  filter(n_extant_populations<50) %>%
  filter(country_assessment!="sweden", country_assessment!="united_states") %>%
  
  # plot
  ggplot(aes(x=defined_populations_simplified, y=n_extant_populations, color=defined_populations_simplified)) +
          geom_boxplot() +
          geom_jitter(size=.3, width = 0.1, color="black") +
  coord_flip() +
  theme_bw() + ylab("") +
  theme(panel.border = element_blank(), legend.position="none") + 
  ggtitle("Distribution of extant populations by country and method to define pops, excluding outliers (>200 pops)") +
  facet_wrap(country_assessment ~ ., nrow=2) +
  xlab("")  +
  scale_color_manual(values=simplifiedmethods_colors,
                    breaks=levels(as.factor(ind2_data$defined_populations_simplified)))

## With Sweden and US
p2<-ind2_data %>% 
  filter(n_extant_populations<200) %>%
  filter(country_assessment %in% c("sweden", "united_states")) %>%
  
  # plot
  ggplot(aes(x=defined_populations_simplified, y=n_extant_populations, color=defined_populations_simplified)) +
          geom_boxplot() +
          geom_jitter(size=.3, width = 0.1, color="black") +
  coord_flip() +
  theme_bw() + xlab("") + ylab("Number of extant populations") +
  theme(panel.border = element_blank(), legend.position="none") + 
  facet_wrap(country_assessment ~ ., nrow=1, ncol=2) +
  scale_color_manual(values=simplifiedmethods_colors,
                    breaks=levels(as.factor(ind2_data$defined_populations_simplified)))
  ## plot together
plot_grid(p1, p2, nrow=2, rel_heights = c(2,1))

Number of populations by taxonomic group and range type:

# add species range metadata
ind2_data$species_range <- metadata$species_range

ind2_data %>% 
  filter(n_extant_populations<200) %>%
  ggplot(aes(x=taxonomic_group, y=n_extant_populations, color=species_range)) +
          geom_boxplot() +
          geom_jitter(size=.3, position = position_jitterdodge()) +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank()) + 
  ggtitle("Distribution of extant populations by taxonomic group and range, \nexcluding outliers (>200 pops)")

Number of populations by taxonomic group and global IUCN:

# add species range metadata
ind2_data$global_IUCN <- metadata$global_IUCN

ind2_data %>% 
  filter(n_extant_populations<200) %>%
  ggplot(aes(x=taxonomic_group, y=n_extant_populations, col=global_IUCN)) +
          geom_boxplot() +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank()) + 
  ggtitle("Distribution of extant populations by taxonomic group and global IUCN, \nexcluding outliers (>200 pops)") +
  scale_color_manual(values= IUCNcolors, # iucn color codes
                    breaks=c(levels(as.factor(ind2_data$global_IUCN))))

ANOVAS on the number of extant populations

One-way ANOVA for the effect of the method to define populations on the number of extant pops, removing the extreme outlier (>1,000 pops)

# subset data without massive outlier
ind2_data_anova<- ind2_data %>% 
                         filter(n_extant_populations<1000)

# summary of n per variable
table(ind2_data_anova$defined_populations_simplified)
## 
##         adaptive_traits management_units 
##                                       19 
##                       eco_biogeo_proxies 
##                                       41 
##                         genetic_clusters 
##                                      105 
##      genetic_clusters eco_biogeo_proxies 
##                                       19 
##   genetic_clusters geographic_boundaries 
##                                       74 
##                    geographic_boundaries 
##                                      272 
##    geographic_boundaries adaptive_traits 
##                                       32 
## geographic_boundaries eco_biogeo_proxies 
##                                       74 
##   geographic_boundaries management_units 
##                                       25 
##                    low_freq_combinations 
##                                      101 
##                         management_units 
##                                      127 
##                                    other 
##                                       13
# One way ANOVA method
res.anova.extant<-aov(n_extant_populations ~ defined_populations_simplified, data=ind2_data_anova)
res.anova.extant
## Call:
##    aov(formula = n_extant_populations ~ defined_populations_simplified, 
##     data = ind2_data_anova)
## 
## Terms:
##                 defined_populations_simplified Residuals
## Sum of Squares                         34724.1 1798790.6
## Deg. of Freedom                             11       890
## 
## Residual standard error: 44.95679
## Estimated effects may be unbalanced
summary(res.anova.extant)
##                                 Df  Sum Sq Mean Sq F value Pr(>F)
## defined_populations_simplified  11   34724    3157   1.562  0.105
## Residuals                      890 1798791    2021

Same One-way ANOVA for the effect of the method to define populations on the number of extant pops, but removing outliers >200 pops

# subset data without outliers
ind2_data_anova<- ind2_data %>% 
                         filter(n_extant_populations<200)

# summary of n per variable
table(ind2_data_anova$defined_populations_simplified)
## 
##         adaptive_traits management_units 
##                                       19 
##                       eco_biogeo_proxies 
##                                       40 
##                         genetic_clusters 
##                                      105 
##      genetic_clusters eco_biogeo_proxies 
##                                       19 
##   genetic_clusters geographic_boundaries 
##                                       72 
##                    geographic_boundaries 
##                                      268 
##    geographic_boundaries adaptive_traits 
##                                       32 
## geographic_boundaries eco_biogeo_proxies 
##                                       73 
##   geographic_boundaries management_units 
##                                       25 
##                    low_freq_combinations 
##                                      100 
##                         management_units 
##                                      126 
##                                    other 
##                                       13
# One way ANOVA method
res.anova.extant<-aov(n_extant_populations ~ defined_populations_simplified, data=ind2_data_anova)
res.anova.extant
## Call:
##    aov(formula = n_extant_populations ~ defined_populations_simplified, 
##     data = ind2_data_anova)
## 
## Terms:
##                 defined_populations_simplified Residuals
## Sum of Squares                         17687.1  426965.7
## Deg. of Freedom                             11       880
## 
## Residual standard error: 22.02699
## Estimated effects may be unbalanced
summary(res.anova.extant)
##                                 Df Sum Sq Mean Sq F value   Pr(>F)    
## defined_populations_simplified  11  17687  1607.9   3.314 0.000176 ***
## Residuals                      880 426966   485.2                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):

# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]

One-way ANOVA for the effect of the country on the number of extant pops, removing outliers >200 pops

# subset data without outliers
ind2_data_anova<- ind2_data %>% 
                         filter(n_extant_populations<200)

# summary of n per variable
table(ind2_data_anova$country_assessment)
## 
##     australia       belgium      colombia        france         japan 
##            85           111            47            71            50 
##        mexico  south_africa        sweden united_states 
##            95           121           121           191
# One way ANOVA method
res.anova.extant<-aov(n_extant_populations ~ country_assessment, data=ind2_data_anova)
res.anova.extant
## Call:
##    aov(formula = n_extant_populations ~ country_assessment, data = ind2_data_anova)
## 
## Terms:
##                 country_assessment Residuals
## Sum of Squares             36921.2  407731.6
## Deg. of Freedom                  8       883
## 
## Residual standard error: 21.48854
## Estimated effects may be unbalanced
summary(res.anova.extant)
##                     Df Sum Sq Mean Sq F value   Pr(>F)    
## country_assessment   8  36921    4615   9.995 2.15e-13 ***
## Residuals          883 407732     462                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):

# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]

One-way ANOVA for the effect of the taxonomic group on the number of extant pops, removing outliers >200 pops and taxonomic groups with too few data

# summary of n per variable
table(ind2_data$taxonomic_group)
## 
##     amphibian    angiosperm          bird     bryophyte          fish 
##            63           233           140             5            70 
##        fungus    gymnosperm  invertebrate        mammal         other 
##             3            19           148           140            17 
## pteridophytes       reptile 
##            14            72
# subset data 
ind2_data_anova<- ind2_data %>% 
                         filter(n_extant_populations<200) %>% 
                         filter(taxonomic_group %!in% c("fungus", "bryophyte", "other", "pteridophytes"))

# summary of n per variable
table(ind2_data_anova$taxonomic_group)
## 
##    amphibian   angiosperm         bird         fish   gymnosperm invertebrate 
##           60          229          139           69           18          142 
##       mammal      reptile 
##          135           62
# One way ANOVA method
res.anova.extant<-aov(n_extant_populations ~ taxonomic_group, data=ind2_data_anova)
res.anova.extant
## Call:
##    aov(formula = n_extant_populations ~ taxonomic_group, data = ind2_data_anova)
## 
## Terms:
##                 taxonomic_group Residuals
## Sum of Squares          23890.3  417880.0
## Deg. of Freedom               7       846
## 
## Residual standard error: 22.22494
## Estimated effects may be unbalanced
summary(res.anova.extant)
##                  Df Sum Sq Mean Sq F value   Pr(>F)    
## taxonomic_group   7  23890    3413   6.909 5.17e-08 ***
## Residuals       846 417880     494                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):

# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]

Two-way ANOVA with interaction effect of the method to define populations and the country, removing outliers >200 pops Question: is interaction he correct model? or should it be additive?

# subset data without outliers
ind2_data_anova<- ind2_data %>% 
                         filter(n_extant_populations<200)

# summary of n per variable
ind2_data_anova %>%
  group_by(country_assessment, defined_populations_simplified) %>% summarise(n=n())
# Two-way ANOVA with interaction effect of the method to define populations and the country
res.anova.extant<-aov(n_extant_populations ~ defined_populations_simplified * country_assessment , data=ind2_data_anova)
res.anova.extant
## Call:
##    aov(formula = n_extant_populations ~ defined_populations_simplified * 
##     country_assessment, data = ind2_data_anova)
## 
## Terms:
##                 defined_populations_simplified country_assessment
## Sum of Squares                         17687.1            26099.2
## Deg. of Freedom                             11                  8
##                 defined_populations_simplified:country_assessment Residuals
## Sum of Squares                                            26752.3  374114.1
## Deg. of Freedom                                                46       826
## 
## Residual standard error: 21.28198
## 42 out of 108 effects not estimable
## Estimated effects may be unbalanced
summary(res.anova.extant)
##                                                    Df Sum Sq Mean Sq F value
## defined_populations_simplified                     11  17687    1608   3.550
## country_assessment                                  8  26099    3262   7.203
## defined_populations_simplified:country_assessment  46  26752     582   1.284
## Residuals                                         826 374114     453        
##                                                     Pr(>F)    
## defined_populations_simplified                    6.79e-05 ***
## country_assessment                                2.95e-09 ***
## defined_populations_simplified:country_assessment    0.101    
## Residuals                                                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):

# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]

Two-way ANOVA with interaction effect of the method to define populations and the country, but keeping only groups with enough n

# variables with enough n
enough_n<-ind2_data %>%
              group_by(country_assessment, defined_populations_simplified) %>% 
              summarise(n=n()) %>% 
              filter(n>=15)
## `summarise()` has grouped output by 'country_assessment'. You can override
## using the `.groups` argument.
# subset data without outliers and with enough n
ind2_data_anova<- ind2_data %>% 
                  filter(n_extant_populations<200) %>% 
                                # this gives the country
                  filter(country_assessment==unique(enough_n$country_assessment)[1] & 
                                 #this gives the methods for that country (the last [[1]] is to get the results out of a list)
                  defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[1], 2][[1]]  | 
                                
                  # the same for rest of countries. Notice the use of & for methods within country and | to change to other country
                  country_assessment==unique(enough_n$country_assessment)[2] & 
                  defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[2], 2][[1]] | 
                    
                  country_assessment==unique(enough_n$country_assessment)[3] & 
                  defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[3], 2][[1]] |
                    
                  country_assessment==unique(enough_n$country_assessment)[4] & 
                  defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[4], 2][[1]] | 
                    
                  country_assessment==unique(enough_n$country_assessment)[5] & 
                  defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[5], 2][[1]] | 
                    
                  country_assessment==unique(enough_n$country_assessment)[6] & 
                  defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[6], 2][[1]] | 
                    
                  country_assessment==unique(enough_n$country_assessment)[7] & 
                  defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[7], 2][[1]] | 
                    
                  country_assessment==unique(enough_n$country_assessment)[8] & 
                  defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[8], 2][[1]])

# summary of n per variable
ind2_data_anova %>%
  group_by(country_assessment, defined_populations_simplified) %>% summarise(n=n())
## `summarise()` has grouped output by 'country_assessment'. You can override
## using the `.groups` argument.
# Two-way ANOVA with interaction effect of the method to define populations and the country
res.anova.extant<-aov(n_extant_populations ~ defined_populations_simplified * country_assessment , data=ind2_data_anova)
res.anova.extant
## Call:
##    aov(formula = n_extant_populations ~ defined_populations_simplified * 
##     country_assessment, data = ind2_data_anova)
## 
## Terms:
##                 defined_populations_simplified country_assessment
## Sum of Squares                         5564.08            5584.38
## Deg. of Freedom                              7                  4
##                 defined_populations_simplified:country_assessment Residuals
## Sum of Squares                                             328.40 138849.53
## Deg. of Freedom                                                 3       509
## 
## Residual standard error: 16.51632
## 49 out of 64 effects not estimable
## Estimated effects may be unbalanced
summary(res.anova.extant)
##                                                    Df Sum Sq Mean Sq F value
## defined_populations_simplified                      7   5564   794.9   2.914
## country_assessment                                  4   5584  1396.1   5.118
## defined_populations_simplified:country_assessment   3    328   109.5   0.401
## Residuals                                         509 138850   272.8        
##                                                     Pr(>F)    
## defined_populations_simplified                    0.005365 ** 
## country_assessment                                0.000475 ***
## defined_populations_simplified:country_assessment 0.752137    
## Residuals                                                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):

# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]

Number of extinct populations.

See the distribution of the number of extinct populations:

ind2_data %>% 
  ggplot(aes(x=n_extint_populations)) +
  geom_histogram() + 
  ggtitle("histogram of extinct populations")

Exclude outliers (>200 populations)

ind2_data %>% 
  filter(n_extint_populations<200) %>%
  ggplot(aes(x=n_extint_populations)) +
  geom_histogram() + 
  ggtitle("Distribution of extinct populations \nexcluding outliers (>200 pops)")

How does the number of populations vary by country? (excluding outliers: >200 pops)

ind2_data %>% 
  filter(n_extint_populations<200) %>%
  ggplot(aes(x=country_assessment, y=n_extint_populations)) +
  geom_boxplot(aes(color=country_assessment)) +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank(), legend.position="none") + 
  ggtitle("Distribution of extinct populations by country, \nexcluding outliers (>200 pops)")

by method to define populations? (excluding outliers: >200 pops) with simplified method categories for easier visualization:

ind2_data %>% 
  filter(n_extint_populations<200) %>%
  ggplot(aes(x=defined_populations_simplified, y=n_extint_populations, color=defined_populations_simplified)) +
  geom_boxplot() +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank(), legend.position="none") + 
  ggtitle("Distribution of extinct populations by pop definition method, \nexcluding outliers (>200 pops)") +
  scale_color_manual(values=simplifiedmethods_colors,
                     breaks=levels(as.factor(ind2_data$defined_populations_simplified)))

Number of populations by taxonomic group:

ind2_data %>% 
  filter(n_extint_populations<200) %>%
  ggplot(aes(x=taxonomic_group, y=n_extint_populations, color=taxonomic_group)) +
  geom_boxplot() +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank()) + 
  ggtitle("Distribution of extinct populations by taxonomic group, \nexcluding outliers (>200 pops)") +
  scale_color_manual(values= taxoncolors,
                     breaks=levels(levels(as.factor(ind2_data$taxonomic_group))))

Taxonomic group and method:

ind2_data %>% 
  filter(n_extint_populations<200) %>%
  ggplot(aes(x=taxonomic_group, y=n_extint_populations, color=taxonomic_group)) +
  geom_boxplot() +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank()) + 
  ggtitle("Distribution of extinct populations by taxonomic group and \nmethod to define pops, excluding outliers (>200 pops)") +
  scale_color_manual(values= taxoncolors,
                     breaks=levels(levels(as.factor(ind2_data$taxonomic_group)))) +
  facet_wrap(defined_populations_simplified ~ .) +
  xlab("")

Country and method:

ind2_data %>% 
  filter(n_extint_populations<200) %>%
  ggplot(aes(x=defined_populations_simplified, y=n_extint_populations, color=defined_populations_simplified)) +
  geom_boxplot() +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank(), legend.position="none") + 
  ggtitle("Distribution of extinct populations by country and \nmethod to define pops, excluding outliers (>200 pops)") +
  facet_wrap(country_assessment ~ ., nrow=2) +
  xlab("")  +
  scale_color_manual(values=simplifiedmethods_colors,
                     breaks=levels(as.factor(ind2_data$defined_populations_simplified)))

Number of populations by taxonomic group and range type:

# add species range metadata
ind2_data$species_range <- metadata$species_range

ind2_data %>% 
  filter(n_extint_populations<200) %>%
  ggplot(aes(x=taxonomic_group, y=n_extint_populations, color=species_range)) +
  geom_boxplot() +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank()) + 
  ggtitle("Distribution of extinct populations by taxonomic group and range, \nexcluding outliers (>200 pops)")

Number of populations by taxonomic group and global IUCN:

# add species range metadata
ind2_data$global_IUCN <- metadata$global_IUCN

ind2_data %>% 
  filter(n_extint_populations<200) %>%
  ggplot(aes(x=taxonomic_group, y=n_extint_populations, col=global_IUCN)) +
  geom_boxplot() +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank()) + 
  ggtitle("Distribution of extinct populations by taxonomic group and global IUCN, \nexcluding outliers (>200 pops)") +
  scale_color_manual(values= IUCNcolors, # iucn color codes
                     breaks=c(levels(as.factor(ind2_data$global_IUCN))))

Numberof extant vs extinct number of populations

By method

ind2_data %>%
  # filter outliers with too many pops
  filter(n_extant_populations<200) %>%
  
  # plot
    ggplot(aes(x=n_extant_populations, y=n_extint_populations, color=defined_populations_simplified)) +
    geom_point() +
    theme_bw() +
    scale_color_manual(values=simplifiedmethods_colors,
                    breaks=levels(as.factor(ind2_data$defined_populations_simplified))) 

By method. Sweden and US separately because they have too many pops.

## excluding Sweden and US

p1 <- ind2_data %>%
  # filter outliers with too many pops and desired countries
  filter(n_extant_populations<50) %>%
  filter(country_assessment!="sweden", country_assessment!="united_states") %>%
  
  # plot
    ggplot(aes(x=n_extant_populations, y=n_extint_populations, color=defined_populations_simplified)) +
    geom_point() + theme_bw() + xlab("") +
    scale_color_manual(values=simplifiedmethods_colors,
                    breaks=levels(as.factor(ind2_data$defined_populations_simplified))) + facet_wrap(country_assessment ~ ., nrow=2) +
   theme(panel.border = element_blank(), legend.position="none")

## With Sweden and US
p2<-ind2_data %>%
  # filter outliers with too many pops and desired countries
  filter(n_extant_populations<200) %>%
  filter(country_assessment %in% c("sweden", "united_states")) %>%
  
  # plot
    ggplot(aes(x=n_extant_populations, y=n_extint_populations, color=defined_populations_simplified)) +
    geom_point() + theme_bw() +
    scale_color_manual(values=simplifiedmethods_colors,
                    breaks=levels(as.factor(ind2_data$defined_populations_simplified))) + facet_wrap(country_assessment ~ ., nrow=1, ncol=2) +
   theme(panel.border = element_blank(), legend.position="bottom")

## plot together
plot_grid(p1, p2, nrow=2, rel_heights = c(1.2,1))

By risk status, zooming in to fewer n of pops. Sweden and US separately because they have too many pops.

## excluding Sweden and US

p1 <- ind2_data %>%
  # filter outliers with too many pops and desired countries
  filter(n_extant_populations<50) %>%
  filter(country_assessment!="sweden", country_assessment!="united_states") %>%
  
  # plot
    ggplot(aes(x=n_extant_populations, y=n_extint_populations, color=global_IUCN)) +
    geom_point() + theme_bw() + xlab("") +
    scale_color_manual(values=IUCNcolors,
                    breaks=levels(as.factor(ind2_data$global_IUCN))) + facet_wrap(country_assessment ~ ., nrow=2) +
   theme(panel.border = element_blank(), legend.position="none")

## With Sweden and US
p2<-ind2_data %>%
  # filter outliers with too many pops and desired countries
  filter(n_extant_populations<200) %>%
  filter(country_assessment %in% c("sweden", "united_states")) %>%
  
  # plot
    ggplot(aes(x=n_extant_populations, y=n_extint_populations, color=global_IUCN)) +
    geom_point() + theme_bw() +
    scale_color_manual(values=IUCNcolors,
                    breaks=levels(as.factor(ind2_data$global_IUCN))) + facet_wrap(country_assessment ~ ., nrow=1, ncol=2) +
   theme(panel.border = element_blank(), legend.position="bottom")

## plot together
plot_grid(p1, p2, nrow=2, rel_heights = c(1.2,1))

Distribution of NA in indicator 2

We have NA because in some cases the number of extinct populations is unknown, therefore the above operation cannot be computed.

Total records with NA in extant populations:

sum(is.na(ind2_data$n_extant_populations))
## [1] 21

Which are?

ind2_data %>%
  filter(is.na(n_extant_populations)) %>%
    select(country_assessment, taxonomic_group, taxon, n_extant_populations, n_extint_populations)

Total taxa with NA in extinct populations:

sum(is.na(ind2_data$n_extint_populations))
## [1] 359

Do taxa with NA for extant also have NA for extinct?

ind2_data$taxon[is.na(ind2_data$n_extant_populations)] %in% ind2_data$taxon[is.na(ind2_data$n_extint_populations)]
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE

So out of the 924, we have 359 records with NA in n_extinct and 21 records with NA in n_extant. Of them, 21 have NA in both n_extant and n_extinct.

QUESTION: should we manually check that NA are correct in both extinct or extant pops? (the cleaning script only chekcs for 0, not NAs)

So in total there are 359 records where there are NA in either n_extant or n_extinct, which is 38.85% of total number of records. Therefore when estimating indicator 2… QUESTION: What should we do: A) we can’t estimate indicator 2 in those species, or B) we assume n_extinct = NA = 0, and therefore indicator 2 = 1.

Plots!

By country

ind2_data %>%
  ggplot(aes(x=country_assessment, fill=is.na(indicator2)))+
  geom_bar(position = "dodge") +
  coord_flip() + 
  scale_fill_discrete(labels=c("number of population known", "has missing data")) + 
  labs(fill="Indicator 2 data") +
  xlab("")

By taxonomic group

ind2_data %>%
  ggplot(aes(x=taxonomic_group, fill=is.na(indicator2)))+
  geom_bar(position = "dodge") +
  coord_flip()+ 
  scale_fill_discrete(labels=c("number of populations known", "has missing data")) + 
  labs(fill="Indicator 2 data") +
  xlab("")

By method to define pops

ind2_data %>%
  ggplot(aes(x=defined_populations_simplified, fill=is.na(indicator2)))+
  geom_bar(position = "dodge") +
  coord_flip()+ 
  scale_fill_discrete(labels=c("number of populations known", "has missing data")) + 
  labs(fill="Indicator 2 data") +
  xlab("")

By method to define pops and country

ind2_data %>%
  ggplot(aes(x=defined_populations_simplified, fill=is.na(indicator2)))+
  geom_bar(position = "dodge") +
  coord_flip()+ 
  scale_fill_discrete(labels=c("number of populations known", "has missing data")) + 
  labs(fill="Indicator 2 data") +
  xlab("") + facet_wrap(country_assessment ~., nrow = 2) +
  theme(panel.border = element_blank(), legend.position="bottom")

By taxonomic group and country

ind2_data %>%
  ggplot(aes(x=taxonomic_group, fill=is.na(indicator2)))+
  geom_bar(position = "dodge") +
  coord_flip()+ 
  scale_fill_discrete(labels=c("number of populations known", "has missing data")) + 
  labs(fill="Indicator 2 data") +
  xlab("") + facet_wrap(country_assessment ~., nrow = 2) +
  theme(panel.border = element_blank(), legend.position="right")

Distribution of multiassessments and single assesments

Some taxa were assessed more than once to account for, for example, different ways in how to delimit populations. Create a subset of them, excluding those records with missing data in indicator2 (due to missing data in n_pops).

#subset only with taxa assessed multiple times:
ind2_data_multi<-ind2_data %>% 
                          filter(multiassessment=="multiassessment") 

In total there are 93 multiassessed records, of 44 taxa. Notice that this can include missing data in the number of populations, hence not allowing to estimate indicator 2.

To be able to visualize the missing data, the following plot changes NA to -1. Variation in the number of extant populations by assessment

# plot
ind2_data_multi %>% 
  mutate(n_extant_populations=ifelse(is.na(n_extant_populations), -1, n_extant_populations)) %>%
  ggplot(aes(x=taxon, y=n_extant_populations)) +
          geom_line(colour="darkgrey") + 
          geom_point(aes(color=country_assessment)) +
  xlab("") +
  coord_flip() +
  ggtitle("Variation in the number of extant populations, \nfor multiassessed taxa") +
  theme_bw() + 
  theme(legend.position = "bottom")

Same plot, but excluding Bombus terricola’s massive variation:

# plot
ind2_data_multi %>% 
  filter(taxon!="Bombus terricola") %>% 
  mutate(n_extant_populations=ifelse(is.na(n_extant_populations), -1, n_extant_populations)) %>%
  ggplot(aes(x=taxon, y=n_extant_populations)) +
          geom_line(colour="darkgrey") + 
          geom_point(aes(color=country_assessment)) +
  xlab("") +
  coord_flip() +
  ggtitle("Variation in the number of extant populations, \nfor multiassessed taxa") +
  theme_bw() + 
  theme(legend.position = "bottom")

Now for extinct populations (NA transformed as -1 for visualization purposes):

# plot
ind2_data_multi %>% 
  mutate(n_extint_populations=ifelse(is.na(n_extint_populations), -1, n_extint_populations)) %>%
  ggplot(aes(x=taxon, y=n_extint_populations)) +
          geom_line(colour="darkgrey") + 
          geom_point(aes(color=country_assessment)) +
  xlab("") +
  coord_flip() +
  ggtitle("Variation in the number of extinct populations, \nfor multiassessed taxa") +
  theme_bw() + 
  theme(legend.position = "bottom")

See you later, Bombus terricola

# plot
ind2_data_multi %>% 
  filter(taxon!="Bombus terricola") %>%
  mutate(n_extint_populations=ifelse(is.na(n_extint_populations), -1, n_extint_populations)) %>%
  ggplot(aes(x=taxon, y=n_extint_populations)) +
          geom_line(colour="darkgrey") + 
          geom_point(aes(color=country_assessment)) +
  xlab("") +
  coord_flip() +
  ggtitle("Variation in the number of extinct populations, \nfor multiassessed taxa") +
  theme_bw() + 
  theme(legend.position = "bottom")

This is how much the values of indicator2 vary within mutliassessed taxa (taxa names with no shown values mean they have missing data in the number of populations and hence indicator 2 can’t be estimated):

# plot
ind2_data_multi %>% 
  ggplot(aes(x=taxon, y=indicator2)) +
          geom_line(colour="darkgrey") + 
          geom_point(aes(color=country_assessment)) +
  xlab("") +
  coord_flip() +
  ggtitle("Variation in indicator 2, \nfor multiassessed taxa") +
  theme_bw() + 
  theme(legend.position = "bottom")

For exploratory purposes, unless otherwise stated differently, the analyses below will use a subset of the data including only taxa assessed a single time, plus the first record of those assessed multiple times.

#subset keeping only taxa assessed a single time, plust the first record of those assessed multiple times.
ind2_data_firstmulti<-ind2_data[!duplicated(cbind(ind2_data$taxon, ind2_data$country_assessment)), ]

Indicator 2 values distribution

Remember, for exploratory purposes, unless otherwise stated differently, the analyses below will use a subset of the data including only taxa assessed a single time, plus the first record of those assessed multiple times.

For the taxa that do have data, this is how the values of indicator2 are distributed:

hist(ind2_data_firstmulti$indicator2)

Visualizing by country

# get sample size by desired category

sample_size <- ind2_data_firstmulti %>%
                    filter(!is.na(indicator2)) %>% 
                    group_by(country_assessment) %>% summarize(num=n())

# plot
ind2_data_firstmulti %>% 
  # add sampling size 
  left_join(sample_size) %>%
  mutate(myaxis = paste0(country_assessment, " (n= ", num, ")")) %>%

  # plot
  ggplot(aes(x=myaxis, y=indicator2, fill=country_assessment)) +
  geom_violin(width= 1, linewidth = 0)  +
  geom_jitter(size=.5, width = 0.1) +
  xlab("") +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank(), legend.position="none")

Visualizing by taxonomic group:

# get sample size by desired category

sample_size <- ind2_data_firstmulti %>%
                    filter(!is.na(indicator2)) %>% 
                    group_by(taxonomic_group) %>% summarize(num=n())

# plot
ind2_data_firstmulti %>% 
  # add sampling size 
  left_join(sample_size) %>%
  mutate(myaxis = paste0(taxonomic_group, " (n= ", num, ")")) %>%
  
  #plot
ggplot(aes(x=myaxis, y=indicator2, fill=taxonomic_group), color=NA) +
      geom_violin(width=2, linewidth = 0)  +
      geom_jitter(size=.5, width = 0.1, color="brown") +
      xlab("") +
      coord_flip() +
      scale_fill_manual(values= taxoncolors,
                    breaks=c(levels(as.factor(ind2_data_firstmulti$taxonomic_group)))) +
      theme_bw() +
      theme(panel.border = element_blank(), legend.position="none", text = element_text(size = 15)) 

Same boxplot

# get sample size by desired category
sample_size <- ind2_data_firstmulti %>%
                    filter(!is.na(indicator2)) %>% 
                    group_by(taxonomic_group) %>% summarize(num=n())

ind2_data_firstmulti %>% 
  # add sampling size 
  left_join(sample_size) %>%
  mutate(myaxis = paste0(taxonomic_group, " (n= ", num, ")")) %>%
  
  #plot
ggplot(aes(x=myaxis, y=indicator2, color=taxonomic_group)) +
      geom_boxplot()  +
      geom_jitter(size=.5, width = 0.1, color="brown") +
      xlab("") +
      coord_flip() +
      scale_color_manual(values= taxoncolors,
                    breaks=c(levels(as.factor(ind2_data_firstmulti$taxonomic_group)))) +
      theme_bw() +
      theme(panel.border = element_blank(), legend.position="none", text = element_text(size = 15)) 

Zoom in to invertebrates by country:

ind2_data_firstmulti %>% 
      filter(!is.na(indicator2)) %>%
      filter(taxonomic_group=="invertebrate") %>% 

ggplot(aes(x=country_assessment, y=indicator2, fill=country_assessment), color=NA) +
      geom_violin()  +
      geom_jitter(size=.7, width = 0.1, color="black") +
      xlab("") +
      coord_flip() +
      theme_bw() +
      theme(panel.border = element_blank(), legend.position="none", text = element_text(size = 15)) +
      ggtitle("Distribution of Indicator 2 for Invertebrates")

Visualizing by IUCN:

# add IUCN data form metadata to ind2
ind2_data_firstmulti$global_IUCN <- metadata_firstmulti$global_IUCN

# get sample size by desired category
sample_size <- ind2_data_firstmulti %>%
                    filter(!is.na(indicator2)) %>% 
                    group_by(global_IUCN) %>% summarize(num=n())

# plot
ind2_data_firstmulti %>% 
  # add sampling size 
  left_join(sample_size) %>%
  mutate(myaxis = paste0(global_IUCN, " (n= ", num, ")")) %>%

  # plot
  ggplot(aes(x=myaxis, y=indicator2, fill=global_IUCN)) +
      geom_violin(width=1, linewidth = 0)  +
      geom_jitter(size=.5, width = 0.1) +
      xlab("") +
      coord_flip() +
      scale_fill_manual(values= IUCNcolors, # iucn color codes
                    breaks=c(levels(as.factor(ind2_data_firstmulti$global_IUCN)))) +
      theme_bw() +
      theme(panel.border = element_blank(), legend.position="none", text= element_text(size=15))

Visualizing by range type:

# add data form metadata to ind2
ind2_data_firstmulti$species_range <- metadata_firstmulti$species_range

# get sample size by desired category
sample_size <- ind2_data_firstmulti %>%
                    filter(!is.na(indicator2)) %>% 
                    group_by(species_range) %>% summarize(num=n())

# plot
ind2_data_firstmulti %>% 
  # add sampling size 
  left_join(sample_size) %>%
  mutate(myaxis = paste0(species_range, " (n= ", num, ")")) %>%

  # plot
  ggplot(aes(x=myaxis, y=indicator2, fill=species_range)) +
      geom_violin(width=1, linewidth = 0)  +
      geom_jitter(size=.5, width = 0.1) +
      xlab("") +
      coord_flip() +
      theme_bw() +
      theme(panel.border = element_blank(), legend.position="none", text= element_text(size=20))

Visualizing by rarity:

# add data form metadata to ind2
ind2_data_firstmulti$rarity <- metadata_firstmulti$rarity

# get sample size by desired category
sample_size <- ind2_data_firstmulti %>%
                    filter(!is.na(indicator2)) %>% 
                    group_by(rarity) %>% summarize(num=n())

# plot
ind2_data_firstmulti %>% 
  # add sampling size 
  left_join(sample_size) %>%
  mutate(myaxis = paste0(rarity, " (n= ", num, ")")) %>%

  # plot
  ggplot(aes(x=myaxis, y=indicator2, fill=rarity)) +
      geom_violin(width=1, linewidth = 0)  +
      geom_jitter(size=.5, width = 0.1) +
      xlab("") +
      coord_flip() +
      theme_bw() +
      theme(panel.border = element_blank(), legend.position="none", text= element_text(size=20))

By population method, boxplot version:

# get sample size by desired category

sample_size <- ind2_data_firstmulti %>%
                    filter(!is.na(indicator2)) %>% 
                    group_by(defined_populations_simplified) %>% summarize(num=n())

# plot
ind2_data_firstmulti %>% 
  # add sampling size 
  left_join(sample_size) %>%
  mutate(myaxis = paste0(defined_populations_simplified, " (n= ", num, ")")) %>%

  # plot
  ggplot(aes(x=myaxis, y=indicator2, color=defined_populations_simplified)) +
  geom_boxplot()  +
  geom_jitter(size=.5, width = 0.1, color="black") +
  xlab("") +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank(), legend.position="none") +
  scale_color_manual(values=simplifiedmethods_colors,
                    breaks=levels(as.factor(ind2_data$defined_populations_simplified)))

Facet by country

ind2_data_firstmulti %>% 

  # plot
  ggplot(aes(x=defined_populations_simplified, y=indicator2, color=defined_populations_simplified)) +
  geom_boxplot()  +
  geom_jitter(size=.5, width = 0.1, color="black") +
  xlab("") +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank(), legend.position="none") + 
  facet_wrap(~country_assessment, ncol=3) +
  scale_color_manual(values=simplifiedmethods_colors,
                    breaks=levels(as.factor(ind2_data_firstmulti$defined_populations_simplified)))

Facet by country and IUCN

ind2_data_firstmulti %>% 

  # plot
 ggplot(aes(x=global_IUCN, y=indicator2, fill=global_IUCN)) +
      geom_violin(width=1, linewidth = 0)  +
      geom_jitter(size=.5, width = 0.1) +
      xlab("") +
      coord_flip() +
      scale_fill_manual(values= IUCNcolors, # iucn color codes
                    breaks=c(levels(as.factor(ind2_data_firstmulti$global_IUCN)))) +
      theme_bw() +
      theme(panel.border = element_blank(), legend.position="none", text= element_text(size=15)) + 
  facet_wrap(~country_assessment, ncol=3) +
  scale_color_manual(values=simplifiedmethods_colors,
                    breaks=levels(as.factor(ind2_data_firstmulti$defined_populations_simplified)))

Indicator 2 within single countries

Value of indicator 2 within a single country, by method to define populations, taxonomic group and iucn. Including only 1 assessment per multiassessed taxa.

### For loop, 3 plots (defined_populations_simplified, taxonomic and global IUCN) by country.
for(i in unique(ind2_data_firstmulti$country_assessment)){


  # filter to desired country in the loop
x <-ind2_data_firstmulti %>% filter(country_assessment==i)


### for pops method

# build plot
p1<- x %>% 
  ggplot(aes(x=defined_populations_simplified, y=indicator2, color=defined_populations_simplified)) +
  geom_boxplot()  +
  geom_jitter(size=.5, width = 0.1, color="black") +
  coord_flip() +
  xlab("") +
  
    ## manually set color
  # use ind2_data_firstmulti instead of x to define number of colors and breaks
  # to ensure same colors are used in all countries
  scale_color_manual(values=simplifiedmethods_colors,
                    breaks=levels(as.factor(ind2_data_firstmulti$defined_populations_simplified))) +
  
  #theme options
  theme_bw() +
  theme(panel.border = element_blank(), legend.position="none") +
  # the plot at the top should have title and no x axis text and labels
  theme(axis.text.x = element_blank()) +
  ylab("") +
  ggtitle(i)

### for taxonomic group
# build plot
p2<- x %>% 
  ggplot(aes(x=taxonomic_group, y=indicator2, color=taxonomic_group)) +
  geom_boxplot()  +
  geom_jitter(size=.5, width = 0.1, color="black") +
  coord_flip() +
  xlab("") +
  
  
  ## manually set color
  # use ind2_data_firstmulti instead of x to define number of colors and breaks
  # to ensure same colors are used in all countries
  scale_color_manual(values=taxoncolors,
                     breaks=levels(as.factor(ind2_data_firstmulti$taxonomic_group))) +
                                       
  # theme options
  theme_bw() +
  theme(panel.border = element_blank(), legend.position="none") +
  # the plot at the middle should have no x axis text and labels and no title
  theme(axis.text.x = element_blank()) +
  ylab("")

### for global IUCN
# build plot
p3<- x %>% 
  ggplot(aes(x=global_IUCN, y=indicator2, color=global_IUCN)) +
  geom_boxplot()  +
  geom_jitter(size=.5, width = 0.1, color="black") +
  coord_flip() +
  xlab("") +
  
  ## manually set color
  # use ind2_data_firstmulti instead of x to define number of colors and breaks
  # to ensure same colors are used in all countries
  scale_color_manual(values=IUCNcolors, # iucn color codes,
                     breaks=levels(as.factor(ind2_data_firstmulti$global_IUCN))) +
                                       
  # theme options
  theme_bw() +
  theme(panel.border = element_blank(), legend.position="none")
  # the plot at the bottom should have axis and labels

#### the 3 plots togheter
# plot_grid from cowplot helps to keep the ploting area aligned.
print(plot_grid(p1, p2, p3, ncol = 1, align = 'v'))
}

Does the number of populations affects the value of indicator 2? (excluding outliers: >200 pops) and incluidng all records of multiassessed taxa:

ind2_data %>%
  # filter outliers with too many pops
  filter(n_extant_populations<200) %>%
  
  # plot
    ggplot(aes(x=n_extant_populations, y=indicator2)) +
    geom_point()

Same, colouring by country

ind2_data %>%
  # filter outliers with too many pops
  filter(n_extant_populations<200) %>%
  
  # plot
    ggplot(aes(x=n_extant_populations, y=indicator2, color=country_assessment)) +
    geom_point() +
    theme_bw()

By global IUCN risk

ind2_data$global_IUCN<-metadata$global_IUCN

ind2_data %>%
  # filter outliers with too many pops
  filter(n_extant_populations<200) %>%
  
  # plot
    ggplot(aes(x=n_extant_populations, y=indicator2, color=global_IUCN)) +
    geom_point() + 
        scale_color_manual(values= IUCNcolors, # iucn color codes
                           breaks=c(levels(as.factor(ind2_data$global_IUCN)))) +
  theme_bw()

By population definition method:

ind2_data %>%
  # filter outliers with too many pops
  filter(n_extant_populations<200) %>%
  
  # plot
    ggplot(aes(x=n_extant_populations, y=indicator2, color=defined_populations_simplified)) +
    geom_point() +
    theme_bw() +
  scale_color_manual(values=simplifiedmethods_colors,
                    breaks=levels(as.factor(ind2_data$defined_populations_simplified)))

Run model (first removing missing data)

# remove missing data 
ind2_data_wo_missing<-ind2_data %>% 
                      filter(!is.na(indicator2)) %>%
                      filter(n_extant_populations<500) # doesn't make a difference in the test below, but useful for plots

# check n per method
table(ind2_data_wo_missing$defined_populations_simplified)
## 
##         adaptive_traits management_units 
##                                       19 
##                       eco_biogeo_proxies 
##                                       30 
##                         genetic_clusters 
##                                       50 
##      genetic_clusters eco_biogeo_proxies 
##                                       12 
##   genetic_clusters geographic_boundaries 
##                                       46 
##                    geographic_boundaries 
##                                      178 
##    geographic_boundaries adaptive_traits 
##                                       32 
## geographic_boundaries eco_biogeo_proxies 
##                                       56 
##   geographic_boundaries management_units 
##                                       17 
##                    low_freq_combinations 
##                                       69 
##                         management_units 
##                                       45 
##                                    other 
##                                        9
# run model
m2 <- glm(ind2_data_wo_missing$indicator2 ~ ind2_data_wo_missing$n_extant_populations + 
            ind2_data_wo_missing$defined_populations_simplified + 
            ind2_data_wo_missing$n_extant_populations*ind2_data_wo_missing$defined_populations_simplified, family = "quasibinomial")
m2
## 
## Call:  glm(formula = ind2_data_wo_missing$indicator2 ~ ind2_data_wo_missing$n_extant_populations + 
##     ind2_data_wo_missing$defined_populations_simplified + ind2_data_wo_missing$n_extant_populations * 
##     ind2_data_wo_missing$defined_populations_simplified, family = "quasibinomial")
## 
## Coefficients:
##                                                                                                                           (Intercept)  
##                                                                                                                               2.40748  
##                                                                                             ind2_data_wo_missing$n_extant_populations  
##                                                                                                                               0.04605  
##                                                                 ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies  
##                                                                                                                              -0.90120  
##                                                                   ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters  
##                                                                                                                               0.48522  
##                                                ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies  
##                                                                                                                               1.73254  
##                                             ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries  
##                                                                                                                              -0.27973  
##                                                              ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries  
##                                                                                                                              -1.00348  
##                                              ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits  
##                                                                                                                               0.12491  
##                                           ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies  
##                                                                                                                              -1.08795  
##                                             ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units  
##                                                                                                                              -0.65696  
##                                                              ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations  
##                                                                                                                              -0.70210  
##                                                                   ind2_data_wo_missing$defined_populations_simplifiedmanagement_units  
##                                                                                                                              -1.81268  
##                                                                              ind2_data_wo_missing$defined_populations_simplifiedother  
##                                                                                                                              -2.49424  
##                       ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies  
##                                                                                                                              -0.04288  
##                         ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters  
##                                                                                                                              -0.12867  
##      ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies  
##                                                                                                                              -0.18537  
##   ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries  
##                                                                                                                              -0.05244  
##                    ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries  
##                                                                                                                              -0.04408  
##    ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits  
##                                                                                                                              -0.03108  
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies  
##                                                                                                                              -0.05005  
##   ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units  
##                                                                                                                               0.05238  
##                    ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations  
##                                                                                                                              -0.04496  
##                         ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedmanagement_units  
##                                                                                                                              -0.05417  
##                                    ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedother  
##                                                                                                                               0.66367  
## 
## Degrees of Freedom: 562 Total (i.e. Null);  539 Residual
## Null Deviance:       246.2 
## Residual Deviance: 214.7     AIC: NA
summary(m2)
## 
## Call:
## glm(formula = ind2_data_wo_missing$indicator2 ~ ind2_data_wo_missing$n_extant_populations + 
##     ind2_data_wo_missing$defined_populations_simplified + ind2_data_wo_missing$n_extant_populations * 
##     ind2_data_wo_missing$defined_populations_simplified, family = "quasibinomial")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8020  -0.3222   0.3601   0.5988   0.9748  
## 
## Coefficients:
##                                                                                                                                       Estimate
## (Intercept)                                                                                                                            2.40748
## ind2_data_wo_missing$n_extant_populations                                                                                              0.04605
## ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies                                                                 -0.90120
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters                                                                    0.48522
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies                                                 1.73254
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries                                             -0.27973
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries                                                              -1.00348
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits                                               0.12491
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies                                           -1.08795
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units                                             -0.65696
## ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations                                                              -0.70210
## ind2_data_wo_missing$defined_populations_simplifiedmanagement_units                                                                   -1.81268
## ind2_data_wo_missing$defined_populations_simplifiedother                                                                              -2.49424
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies                       -0.04288
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters                         -0.12867
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies      -0.18537
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries   -0.05244
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries                    -0.04408
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits    -0.03108
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies -0.05005
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units    0.05238
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations                    -0.04496
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedmanagement_units                         -0.05417
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedother                                     0.66367
##                                                                                                                                       Std. Error
## (Intercept)                                                                                                                              0.78714
## ind2_data_wo_missing$n_extant_populations                                                                                                0.12562
## ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies                                                                    0.85777
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters                                                                      0.93775
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies                                                   1.52683
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries                                                0.84635
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries                                                                 0.79812
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits                                                 0.92914
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies                                              0.81582
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units                                                1.01984
## ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations                                                                 0.82259
## ind2_data_wo_missing$defined_populations_simplifiedmanagement_units                                                                      0.82101
## ind2_data_wo_missing$defined_populations_simplifiedother                                                                                 1.22073
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies                          0.12576
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters                            0.16613
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies         0.13815
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries      0.12566
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries                       0.12567
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits       0.13199
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies    0.12567
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units      0.20612
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations                       0.12620
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedmanagement_units                            0.12677
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedother                                       0.58195
##                                                                                                                                       t value
## (Intercept)                                                                                                                             3.059
## ind2_data_wo_missing$n_extant_populations                                                                                               0.367
## ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies                                                                  -1.051
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters                                                                     0.517
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies                                                  1.135
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries                                              -0.331
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries                                                               -1.257
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits                                                0.134
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies                                            -1.334
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units                                              -0.644
## ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations                                                               -0.854
## ind2_data_wo_missing$defined_populations_simplifiedmanagement_units                                                                    -2.208
## ind2_data_wo_missing$defined_populations_simplifiedother                                                                               -2.043
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies                        -0.341
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters                          -0.775
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies       -1.342
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries    -0.417
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries                     -0.351
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits     -0.235
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies  -0.398
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units     0.254
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations                     -0.356
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedmanagement_units                          -0.427
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedother                                      1.140
##                                                                                                                                       Pr(>|t|)
## (Intercept)                                                                                                                            0.00233
## ind2_data_wo_missing$n_extant_populations                                                                                              0.71407
## ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies                                                                  0.29390
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters                                                                    0.60507
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies                                                 0.25699
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries                                              0.74114
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries                                                               0.20919
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits                                               0.89311
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies                                            0.18291
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units                                              0.51973
## ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations                                                               0.39374
## ind2_data_wo_missing$defined_populations_simplifiedmanagement_units                                                                    0.02767
## ind2_data_wo_missing$defined_populations_simplifiedother                                                                               0.04151
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies                        0.73325
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters                          0.43896
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies       0.18024
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries    0.67658
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries                     0.72594
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits     0.81395
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies  0.69058
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units    0.79949
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations                     0.72178
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedmanagement_units                          0.66933
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedother                                     0.25461
##                                                                                                                                         
## (Intercept)                                                                                                                           **
## ind2_data_wo_missing$n_extant_populations                                                                                               
## ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies                                                                   
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters                                                                     
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies                                                  
## ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries                                               
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries                                                                
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits                                                
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies                                             
## ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units                                               
## ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations                                                                
## ind2_data_wo_missing$defined_populations_simplifiedmanagement_units                                                                   * 
## ind2_data_wo_missing$defined_populations_simplifiedother                                                                              * 
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedeco_biogeo_proxies                         
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters                           
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters eco_biogeo_proxies        
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgenetic_clusters geographic_boundaries     
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries                      
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries adaptive_traits      
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries eco_biogeo_proxies   
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedgeographic_boundaries management_units     
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedlow_freq_combinations                      
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedmanagement_units                           
## ind2_data_wo_missing$n_extant_populations:ind2_data_wo_missing$defined_populations_simplifiedother                                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasibinomial family taken to be 0.3995451)
## 
##     Null deviance: 246.17  on 562  degrees of freedom
## Residual deviance: 214.70  on 539  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 7

Put side by side the plots of number of populations and indicator 2 range (excluding >200 pops outlieres):

## plot for number of populations

# sample size of TOTAL populations
sample_size <- ind2_data %>%
                    filter(!is.na(indicator2)) %>% 
                    group_by(defined_populations_simplified) %>% summarize(num=n())

p1<-ind2_data %>% 
  filter(n_extant_populations<500) %>%
    # add sampling size 
  left_join(sample_size) %>%
  mutate(myaxis = paste0(defined_populations_simplified, " (n= ", num, ")")) %>%
  
# plot
  ggplot(aes(x=myaxis, y=n_extant_populations, color=defined_populations_simplified)) +
          geom_boxplot() + xlab("") + ylab("Number of extant populations") +
          geom_jitter(size=.4, width = 0.1, color="black") +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank(), legend.position="none",
        plot.margin = unit(c(0, 0, 0, 0), "cm")) + # this is used to decrease the space between plots
  scale_color_manual(values=simplifiedmethods_colors,
                    breaks=levels(as.factor(ind2_data$defined_populations_simplified))) +
    theme(text = element_text(size = 15))

## plot for indicator 2
p2<-ind2_data %>% 
  filter(n_extant_populations<500) %>%
  ggplot(aes(x=defined_populations_simplified, y=indicator2, color=defined_populations_simplified)) +
          geom_boxplot() + xlab("") + ylab("Indicator 2") +
          geom_jitter(size=.4, width = 0.1, color="black") +
  coord_flip() +
  theme_bw() +
  theme(panel.border = element_blank(), legend.position="none", axis.text.y = element_blank(),
        plot.margin = unit(c(0, 0, 0, 0), "cm")) + # this is used to decrease the space between plots) + 
  scale_color_manual(values=simplifiedmethods_colors,
                    breaks=levels(as.factor(ind2_data$defined_populations_simplified))) +
  theme(text = element_text(size = 15))

plot_grid(p1, p2, ncol=2, rel_widths = c(1.5,1))

Add the scatter plot of indicator2 and extant pops as a third pannel

p3<- ind2_data %>%
  # filter outliers with too many pops
  filter(n_extant_populations<500) %>%
  
  # plot
    ggplot(aes(x=n_extant_populations, y=indicator2, color=defined_populations_simplified)) +
    geom_point() +
    theme_bw() +
    scale_color_manual(values=simplifiedmethods_colors,
                    breaks=levels(as.factor(ind2_data$defined_populations_simplified))) +
    theme(legend.position = "none") +
    ylab("Indicator 2") +
    xlab("Number of extant populations") +
    theme(text = element_text(size = 15))
p3

plot_grid(p1, p2, p3, ncol=3, rel_widths = c(1.7,1,1), align = "h")  

ANOVAS on the number of extant populations

One-way ANOVA for the effect of the method to define populations on indicator 2, removing the extreme outlier (>1,000 pops)

# subset data without massive outlier
ind2_data_anova<- ind2_data %>% 
  filter(indicator2<1000)

# summary of n per variable
table(ind2_data_anova$defined_populations_simplified)
## 
##         adaptive_traits management_units 
##                                       19 
##                       eco_biogeo_proxies 
##                                       30 
##                         genetic_clusters 
##                                       50 
##      genetic_clusters eco_biogeo_proxies 
##                                       12 
##   genetic_clusters geographic_boundaries 
##                                       46 
##                    geographic_boundaries 
##                                      180 
##    geographic_boundaries adaptive_traits 
##                                       32 
## geographic_boundaries eco_biogeo_proxies 
##                                       56 
##   geographic_boundaries management_units 
##                                       17 
##                    low_freq_combinations 
##                                       69 
##                         management_units 
##                                       45 
##                                    other 
##                                        9
# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ defined_populations_simplified, data=ind2_data_anova)
res.anova.extant
## Call:
##    aov(formula = indicator2 ~ defined_populations_simplified, data = ind2_data_anova)
## 
## Terms:
##                 defined_populations_simplified Residuals
## Sum of Squares                        3.349251 31.065342
## Deg. of Freedom                             11       553
## 
## Residual standard error: 0.2370148
## Estimated effects may be unbalanced
summary(res.anova.extant)
##                                 Df Sum Sq Mean Sq F value   Pr(>F)    
## defined_populations_simplified  11  3.349 0.30448    5.42 3.22e-08 ***
## Residuals                      553 31.065 0.05618                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Same One-way ANOVA for the effect of the method to define populations on indicator 2, but removing outliers >200 pops

# subset data without outliers
ind2_data_anova<- ind2_data %>% 
  filter(indicator2<200)

# summary of n per variable
table(ind2_data_anova$defined_populations_simplified)
## 
##         adaptive_traits management_units 
##                                       19 
##                       eco_biogeo_proxies 
##                                       30 
##                         genetic_clusters 
##                                       50 
##      genetic_clusters eco_biogeo_proxies 
##                                       12 
##   genetic_clusters geographic_boundaries 
##                                       46 
##                    geographic_boundaries 
##                                      180 
##    geographic_boundaries adaptive_traits 
##                                       32 
## geographic_boundaries eco_biogeo_proxies 
##                                       56 
##   geographic_boundaries management_units 
##                                       17 
##                    low_freq_combinations 
##                                       69 
##                         management_units 
##                                       45 
##                                    other 
##                                        9
# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ defined_populations_simplified, data=ind2_data_anova)
res.anova.extant
## Call:
##    aov(formula = indicator2 ~ defined_populations_simplified, data = ind2_data_anova)
## 
## Terms:
##                 defined_populations_simplified Residuals
## Sum of Squares                        3.349251 31.065342
## Deg. of Freedom                             11       553
## 
## Residual standard error: 0.2370148
## Estimated effects may be unbalanced
summary(res.anova.extant)
##                                 Df Sum Sq Mean Sq F value   Pr(>F)    
## defined_populations_simplified  11  3.349 0.30448    5.42 3.22e-08 ***
## Residuals                      553 31.065 0.05618                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):

# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]
<script data-pagedtable-source type="application/json">
{“columns”:[{“label”:[“”],“name”:[“rn”],“type”:[“”],“align”:[“left”]},{“label”:[“diff”],“name”:[1],“type”:[“dbl”],“align”:[“right”]},{“label”:[“lwr”],“name”:[2],“type”:[“dbl”],“align”:[“right”]},{“label”:[“upr”],“name”:[3],“type”:[“dbl”],“align”:[“right”]},{“label”:[“p adj”],“name”:[4],“type”:[“dbl”],“align”:[“right”]}],“data”:[{“1”:“-0.3019396”,“2”:“-0.5147739”,“3”:“-0.089105408”,“4”:“2.516395e-04”,“rn”:“management_units-adaptive_traits management_units”},{“1”:“-0.2008192”,“2”:“-0.3841766”,“3”:“-0.017461895”,“4”:“1.818658e-02”,“rn”:“management_units-eco_biogeo_proxies”},{“1”:“-0.1262098”,“2”:“-0.2505688”,“3”:“-0.001850756”,“4”:“4.305017e-02”,“rn”:“geographic_boundaries-genetic_clusters”},{“1”:“-0.1564425”,“2”:“-0.3078016”,“3”:“-0.005083409”,“4”:“3.550944e-02”,“rn”:“geographic_boundaries eco_biogeo_proxies-genetic_clusters”},{“1”:“-0.3031382”,“2”:“-0.4629854”,“3”:“-0.143291001”,“4”:“6.329219e-08”,“rn”:“management_units-genetic_clusters”},{“1”:“-0.2758263”,“2”:“-0.5285669”,“3”:“-0.023085643”,“4”:“1.900037e-02”,“rn”:“management_units-genetic_clusters eco_biogeo_proxies”},{“1”:“-0.2430020”,“2”:“-0.4061081”,“3”:“-0.079895959”,“4”:“8.400203e-05”,“rn”:“management_units-genetic_clusters geographic_boundaries”},{“1”:“-0.1769285”,“2”:“-0.3065817”,“3”:“-0.047275251”,“4”:“5.587294e-04”,“rn”:“management_units-geographic_boundaries”},{“1”:“-0.3024332”,“2”:“-0.4823197”,“3”:“-0.122546624”,“4”:“3.431045e-06”,“rn”:“management_units-geographic_boundaries adaptive_traits”},{“1”:“-0.2560209”,“2”:“-0.4774831”,“3”:“-0.034558622”,“4”:“8.946065e-03”,“rn”:“management_units-geographic_boundaries management_units”},{“1”:“-0.2169626”,“2”:“-0.3660209”,“3”:“-0.067904295”,“4”:“1.434205e-04”,“rn”:“management_units-low_freq_combinations”}],“options”:{“columns”:{“min”:{},“max”:[10]},“rows”:{“min”:[10],“max”:[10]},“pages”:{}}}

One-way ANOVA for the effect of the country on indicator 2, removing outliers >200 pops

# subset data without outliers
ind2_data_anova<- ind2_data %>% 
  filter(indicator2<200)

# summary of n per variable
table(ind2_data_anova$country_assessment)
## 
##     australia       belgium      colombia        france         japan 
##            28            27            36            34            50 
##        mexico  south_africa        sweden united_states 
##            28            91           125           146
# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ country_assessment, data=ind2_data_anova)
res.anova.extant
## Call:
##    aov(formula = indicator2 ~ country_assessment, data = ind2_data_anova)
## 
## Terms:
##                 country_assessment Residuals
## Sum of Squares             6.57625  27.83834
## Deg. of Freedom                  8       556
## 
## Residual standard error: 0.2237609
## Estimated effects may be unbalanced
summary(res.anova.extant)
##                     Df Sum Sq Mean Sq F value Pr(>F)    
## country_assessment   8  6.576  0.8220   16.42 <2e-16 ***
## Residuals          556 27.838  0.0501                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):

# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]
<script data-pagedtable-source type="application/json">
{“columns”:[{“label”:[“”],“name”:[“rn”],“type”:[“”],“align”:[“left”]},{“label”:[“diff”],“name”:[1],“type”:[“dbl”],“align”:[“right”]},{“label”:[“lwr”],“name”:[2],“type”:[“dbl”],“align”:[“right”]},{“label”:[“upr”],“name”:[3],“type”:[“dbl”],“align”:[“right”]},{“label”:[“p adj”],“name”:[4],“type”:[“dbl”],“align”:[“right”]}],“data”:[{“1”:“-0.4501955”,“2”:“-0.638140376”,“3”:“-0.262250604”,“4”:“4.566098e-10”,“rn”:“belgium-australia”},{“1”:“0.3124810”,“2”:“0.135083661”,“3”:“0.489878426”,“4”:“2.235340e-06”,“rn”:“colombia-belgium”},{“1”:“0.4010613”,“2”:“0.221441724”,“3”:“0.580680857”,“4”:“8.062411e-10”,“rn”:“france-belgium”},{“1”:“0.4714192”,“2”:“0.305005710”,“3”:“0.637832702”,“4”:“4.445180e-10”,“rn”:“japan-belgium”},{“1”:“0.4830699”,“2”:“0.295125041”,“3”:“0.671014813”,“4”:“4.447910e-10”,“rn”:“mexico-belgium”},{“1”:“0.4958742”,“2”:“0.343170891”,“3”:“0.648577540”,“4”:“4.445251e-10”,“rn”:“south_africa-belgium”},{“1”:“0.3288397”,“2”:“0.180964600”,“3”:“0.476714746”,“4”:“8.792915e-10”,“rn”:“sweden-belgium”},{“1”:“0.3531146”,“2”:“0.207140877”,“3”:“0.499088406”,“4”:“4.517717e-10”,“rn”:“united_states-belgium”},{“1”:“0.1589382”,“2”:“0.006630049”,“3”:“0.311246276”,“4”:“3.321074e-02”,“rn”:“japan-colombia”},{“1”:“0.1833932”,“2”:“0.046197636”,“3”:“0.320588708”,“4”:“1.204836e-03”,“rn”:“south_africa-colombia”},{“1”:“-0.1425795”,“2”:“-0.259176991”,“3”:“-0.025982076”,“4”:“4.877067e-03”,“rn”:“sweden-japan”},{“1”:“-0.1183046”,“2”:“-0.232481050”,“3”:“-0.004128079”,“4”:“3.565271e-02”,“rn”:“united_states-japan”},{“1”:“-0.1542303”,“2”:“-0.299917576”,“3”:“-0.008542933”,“4”:“2.865942e-02”,“rn”:“sweden-mexico”},{“1”:“-0.1670345”,“2”:“-0.263054440”,“3”:“-0.071014645”,“4”:“3.212117e-06”,“rn”:“sweden-south_africa”},{“1”:“-0.1427596”,“2”:“-0.235824731”,“3”:“-0.049694417”,“4”:“7.934629e-05”,“rn”:“united_states-south_africa”}],“options”:{“columns”:{“min”:{},“max”:[10]},“rows”:{“min”:[10],“max”:[10]},“pages”:{}}}

One-way ANOVA for the effect of the taxonomic group on indicator 2, removing outliers >200 pops and taxonomic groups with too few data

# summary of n per variable
table(ind2_data$taxonomic_group)
## 
##     amphibian    angiosperm          bird     bryophyte          fish 
##            63           233           140             5            70 
##        fungus    gymnosperm  invertebrate        mammal         other 
##             3            19           148           140            17 
## pteridophytes       reptile 
##            14            72
# subset data 
ind2_data_anova<- ind2_data %>% 
  filter(indicator2<200) %>% 
  filter(taxonomic_group %!in% c("fungus", "bryophyte", "other", "pteridophytes"))

# summary of n per variable
table(ind2_data_anova$taxonomic_group)
## 
##    amphibian   angiosperm         bird         fish   gymnosperm invertebrate 
##           50          141           84           49            9           83 
##       mammal      reptile 
##           82           40
# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ taxonomic_group, data=ind2_data_anova)
res.anova.extant
## Call:
##    aov(formula = indicator2 ~ taxonomic_group, data = ind2_data_anova)
## 
## Terms:
##                 taxonomic_group Residuals
## Sum of Squares         3.835361 29.572753
## Deg. of Freedom               7       530
## 
## Residual standard error: 0.2362153
## Estimated effects may be unbalanced
summary(res.anova.extant)
##                  Df Sum Sq Mean Sq F value   Pr(>F)    
## taxonomic_group   7  3.835  0.5479    9.82 1.55e-11 ***
## Residuals       530 29.573  0.0558                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):

# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]

Two-way ANOVA with interaction effect of the method to define populations and the country, removing outliers >200 pops Question: is interaction he correct model? or should it be additive?

# subset data without outliers
ind2_data_anova<- ind2_data %>% 
  filter(indicator2<200)

# summary of n per variable
ind2_data_anova %>%
  group_by(country_assessment, defined_populations_simplified) %>% summarise(n=n())
# Two-way ANOVA with interaction effect of the method to define populations and the country
res.anova.extant<-aov(indicator2 ~ defined_populations_simplified * country_assessment , data=ind2_data_anova)
res.anova.extant
## Call:
##    aov(formula = indicator2 ~ defined_populations_simplified * country_assessment, 
##     data = ind2_data_anova)
## 
## Terms:
##                 defined_populations_simplified country_assessment
## Sum of Squares                        3.349251           3.683287
## Deg. of Freedom                             11                  8
##                 defined_populations_simplified:country_assessment Residuals
## Sum of Squares                                           1.711872 25.670183
## Deg. of Freedom                                                38       507
## 
## Residual standard error: 0.2250145
## 50 out of 108 effects not estimable
## Estimated effects may be unbalanced
summary(res.anova.extant)
##                                                    Df Sum Sq Mean Sq F value
## defined_populations_simplified                     11  3.349  0.3045   6.014
## country_assessment                                  8  3.683  0.4604   9.093
## defined_populations_simplified:country_assessment  38  1.712  0.0450   0.890
## Residuals                                         507 25.670  0.0506        
##                                                     Pr(>F)    
## defined_populations_simplified                    2.88e-09 ***
## country_assessment                                1.03e-11 ***
## defined_populations_simplified:country_assessment     0.66    
## Residuals                                                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):

# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]

Two-way ANOVA with interaction effect of the method to define populations and the country, but keeping only groups with enough n

# variables with enough n
enough_n<-ind2_data %>%
  group_by(country_assessment, defined_populations_simplified) %>% 
  summarise(n=n()) %>% 
  filter(n>=15)
## `summarise()` has grouped output by 'country_assessment'. You can override
## using the `.groups` argument.
# subset data without outliers and with enough n
ind2_data_anova<- ind2_data %>% 
  filter(indicator2<200) %>% 
  # this gives the country
  filter(country_assessment==unique(enough_n$country_assessment)[1] & 
           #this gives the methods for that country (the last [[1]] is to get the results out of a list)
           defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[1], 2][[1]]  | 
           
           # the same for rest of countries. Notice the use of & for methods within country and | to change to other country
           country_assessment==unique(enough_n$country_assessment)[2] & 
           defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[2], 2][[1]] | 
           
           country_assessment==unique(enough_n$country_assessment)[3] & 
           defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[3], 2][[1]] |
           
           country_assessment==unique(enough_n$country_assessment)[4] & 
           defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[4], 2][[1]] | 
           
           country_assessment==unique(enough_n$country_assessment)[5] & 
           defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[5], 2][[1]] | 
           
           country_assessment==unique(enough_n$country_assessment)[6] & 
           defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[6], 2][[1]] | 
           
           country_assessment==unique(enough_n$country_assessment)[7] & 
           defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[7], 2][[1]] | 
           
           country_assessment==unique(enough_n$country_assessment)[8] & 
           defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[8], 2][[1]])

# summary of n per variable
ind2_data_anova %>%
  group_by(country_assessment, defined_populations_simplified) %>% summarise(n=n())
## `summarise()` has grouped output by 'country_assessment'. You can override
## using the `.groups` argument.
# Two-way ANOVA with interaction effect of the method to define populations and the country
res.anova.extant<-aov(indicator2 ~ defined_populations_simplified * country_assessment , data=ind2_data_anova)
res.anova.extant
## Call:
##    aov(formula = indicator2 ~ defined_populations_simplified * country_assessment, 
##     data = ind2_data_anova)
## 
## Terms:
##                 defined_populations_simplified country_assessment
## Sum of Squares                        4.502732           1.409864
## Deg. of Freedom                              7                  4
##                 defined_populations_simplified:country_assessment Residuals
## Sum of Squares                                           0.010695 14.329461
## Deg. of Freedom                                                 3       294
## 
## Residual standard error: 0.2207706
## 49 out of 64 effects not estimable
## Estimated effects may be unbalanced
summary(res.anova.extant)
##                                                    Df Sum Sq Mean Sq F value
## defined_populations_simplified                      7  4.503  0.6432  13.198
## country_assessment                                  4  1.410  0.3525   7.232
## defined_populations_simplified:country_assessment   3  0.011  0.0036   0.073
## Residuals                                         294 14.329  0.0487        
##                                                     Pr(>F)    
## defined_populations_simplified                    8.61e-15 ***
## country_assessment                                1.45e-05 ***
## defined_populations_simplified:country_assessment    0.974    
## Residuals                                                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):

# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]
<script data-pagedtable-source type="application/json">
{“columns”:[{“label”:[“”],“name”:[“rn”],“type”:[“”],“align”:[“left”]},{“label”:[“diff”],“name”:[1],“type”:[“dbl”],“align”:[“right”]},{“label”:[“lwr”],“name”:[2],“type”:[“dbl”],“align”:[“right”]},{“label”:[“upr”],“name”:[3],“type”:[“dbl”],“align”:[“right”]},{“label”:[“p adj”],“name”:[4],“type”:[“dbl”],“align”:[“right”]}],“data”:[{“1”:“-0.4959211”,“2”:“-0.7093227”,“3”:“-0.282519566”,“4”:“2.728338e-10”,“rn”:“management_units-adaptive_traits management_units”},{“1”:“-0.1294251”,“2”:“-0.2552278”,“3”:“-0.003622326”,“4”:“3.872277e-02”,“rn”:“geographic_boundaries-genetic_clusters”},{“1”:“-0.1718010”,“2”:“-0.3295851”,“3”:“-0.014016829”,“4”:“2.207185e-02”,“rn”:“geographic_boundaries eco_biogeo_proxies-genetic_clusters”},{“1”:“-0.5007798”,“2”:“-0.6849238”,“3”:“-0.316635841”,“4”:“8.852918e-13”,“rn”:“management_units-genetic_clusters”},{“1”:“-0.4770185”,“2”:“-0.6651598”,“3”:“-0.288877091”,“4”:“5.258016e-12”,“rn”:“management_units-genetic_clusters geographic_boundaries”},{“1”:“-0.3713547”,“2”:“-0.5300385”,“3”:“-0.212670959”,“4”:“2.006239e-10”,“rn”:“management_units-geographic_boundaries”},{“1”:“-0.4698739”,“2”:“-0.6863652”,“3”:“-0.253382619”,“4”:“4.592031e-09”,“rn”:“management_units-geographic_boundaries adaptive_traits”},{“1”:“-0.3289788”,“2”:“-0.5140465”,“3”:“-0.143911179”,“4”:“3.317563e-06”,“rn”:“management_units-geographic_boundaries eco_biogeo_proxies”},{“1”:“-0.4673140”,“2”:“-0.6872072”,“3”:“-0.247420839”,“4”:“1.028749e-08”,“rn”:“management_units-low_freq_combinations”}],“options”:{“columns”:{“min”:{},“max”:[10]},“rows”:{“min”:[10],“max”:[10]},“pages”:{}}}

ANOVAs on indicator 2

One-way ANOVA for the effect of the method to define populations on indicator 2, removing the extreme outlier (>1,000 pops)

# subset data without massive outlier
ind2_data_anova<- ind2_data %>% 
  filter(indicator2<1000)

# summary of n per variable
table(ind2_data_anova$defined_populations_simplified)
## 
##         adaptive_traits management_units 
##                                       19 
##                       eco_biogeo_proxies 
##                                       30 
##                         genetic_clusters 
##                                       50 
##      genetic_clusters eco_biogeo_proxies 
##                                       12 
##   genetic_clusters geographic_boundaries 
##                                       46 
##                    geographic_boundaries 
##                                      180 
##    geographic_boundaries adaptive_traits 
##                                       32 
## geographic_boundaries eco_biogeo_proxies 
##                                       56 
##   geographic_boundaries management_units 
##                                       17 
##                    low_freq_combinations 
##                                       69 
##                         management_units 
##                                       45 
##                                    other 
##                                        9
# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ defined_populations_simplified, data=ind2_data_anova)
summary(res.anova.extant)
##                                 Df Sum Sq Mean Sq F value   Pr(>F)    
## defined_populations_simplified  11  3.349 0.30448    5.42 3.22e-08 ***
## Residuals                      553 31.065 0.05618                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Same One-way ANOVA for the effect of the method to define populations on indicator 2, but removing outliers >1000 pops

# subset data without outliers
ind2_data_anova<- ind2_data %>% 
  filter(indicator2<1000)

# summary of n per variable
table(ind2_data_anova$defined_populations_simplified)
## 
##         adaptive_traits management_units 
##                                       19 
##                       eco_biogeo_proxies 
##                                       30 
##                         genetic_clusters 
##                                       50 
##      genetic_clusters eco_biogeo_proxies 
##                                       12 
##   genetic_clusters geographic_boundaries 
##                                       46 
##                    geographic_boundaries 
##                                      180 
##    geographic_boundaries adaptive_traits 
##                                       32 
## geographic_boundaries eco_biogeo_proxies 
##                                       56 
##   geographic_boundaries management_units 
##                                       17 
##                    low_freq_combinations 
##                                       69 
##                         management_units 
##                                       45 
##                                    other 
##                                        9
# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ defined_populations_simplified, data=ind2_data_anova)
summary(res.anova.extant)
##                                 Df Sum Sq Mean Sq F value   Pr(>F)    
## defined_populations_simplified  11  3.349 0.30448    5.42 3.22e-08 ***
## Residuals                      553 31.065 0.05618                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):

# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]

One-way ANOVA for the effect of the country on indicator 2, removing outliers >1000 pops

# subset data without outliers
ind2_data_anova<- ind2_data %>% 
  filter(indicator2<1000)

# summary of n per variable
table(ind2_data_anova$country_assessment)
## 
##     australia       belgium      colombia        france         japan 
##            28            27            36            34            50 
##        mexico  south_africa        sweden united_states 
##            28            91           125           146
# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ country_assessment, data=ind2_data_anova)
summary(res.anova.extant)
##                     Df Sum Sq Mean Sq F value Pr(>F)    
## country_assessment   8  6.576  0.8220   16.42 <2e-16 ***
## Residuals          556 27.838  0.0501                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):

# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]

One-way ANOVA for the effect of the taxonomic group on indicator 2, removing outliers >1000 pops and taxonomic groups with too few data

# summary of n per variable
table(ind2_data$taxonomic_group)
## 
##     amphibian    angiosperm          bird     bryophyte          fish 
##            63           233           140             5            70 
##        fungus    gymnosperm  invertebrate        mammal         other 
##             3            19           148           140            17 
## pteridophytes       reptile 
##            14            72
# subset data 
ind2_data_anova<- ind2_data %>% 
  filter(indicator2<1000) %>% 
  filter(taxonomic_group %!in% c("fungus", "bryophyte", "other"))

# summary of n per variable
table(ind2_data_anova$taxonomic_group)
## 
##     amphibian    angiosperm          bird          fish    gymnosperm 
##            50           141            84            49             9 
##  invertebrate        mammal pteridophytes       reptile 
##            83            82             8            40
# One way ANOVA
res.anova.extant<-aov(indicator2 ~ taxonomic_group, data=ind2_data_anova)
summary(res.anova.extant)
##                  Df Sum Sq Mean Sq F value   Pr(>F)    
## taxonomic_group   8  3.836  0.4794   8.578 4.96e-11 ***
## Residuals       537 30.014  0.0559                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):

# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]

One-way ANOVA for the effect of the global IUCN on indicator 2, removing outliers >1000 pops and taxonomic groups with too few data

# summary of n per variable
table(ind2_data$global_IUCN)
## 
##           cr           dd           en           lc not_assessed           nt 
##           63           15          104          280          259           86 
##      unknown           vu 
##            4          112
# subset data 
ind2_data_anova<- ind2_data %>% 
  filter(indicator2<1000) %>% 
  filter(global_IUCN %!in% c("dd", "unknown"))

# summary of n per variable
table(ind2_data_anova$global_IUCN)
## 
##           cr           en           lc not_assessed           nt           vu 
##           37           62          157          166           57           75
# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ global_IUCN, data=ind2_data_anova)
summary(res.anova.extant)
##              Df Sum Sq Mean Sq F value Pr(>F)
## global_IUCN   5   0.26 0.05136   0.848  0.516
## Residuals   548  33.18 0.06055

Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):

# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]

One-way ANOVA for the effect of the species range on indicator 2

# summary of n per variable
table(ind2_data$species_range)
## 
##   restricted      unknown wide_ranging 
##          509           26          389
# subset data 
ind2_data_anova<- ind2_data %>% 
                  filter(species_range != "unknown")

# summary of n per variable
table(ind2_data_anova$species_range)
## 
##   restricted wide_ranging 
##          509          389
# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ species_range, data=ind2_data_anova)
summary(res.anova.extant)
##                Df Sum Sq Mean Sq F value  Pr(>F)   
## species_range   1   0.43  0.4267   7.094 0.00796 **
## Residuals     548  32.96  0.0601                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 348 observations deleted due to missingness

One-way ANOVA for the effect of the rarity on indicator 2

# summary of n per variable

table(ind2_data_firstmulti$rarity)
## 
##     not_rare rare_natural  rare_recent 
##          371          354          152
# subset data 
ind2_data_anova<- ind2_data_firstmulti


# One way ANOVA method
res.anova.extant<-aov(indicator2 ~ species_range, data=ind2_data_anova)
summary(res.anova.extant)
##                Df Sum Sq Mean Sq F value Pr(>F)  
## species_range   2   0.51  0.2547   4.163 0.0161 *
## Residuals     528  32.31  0.0612                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 346 observations deleted due to missingness

Two-way ANOVA with interaction effect of the method to define populations and the country, removing outliers >200 pops Question: is interaction he correct model? or should it be additive?

# subset data without outliers
ind2_data_anova<- ind2_data %>% 
  filter(indicator2<200)

# summary of n per variable
ind2_data_anova %>%
  group_by(country_assessment, defined_populations_simplified) %>% summarise(n=n())
# Two-way ANOVA with interaction effect of the method to define populations and the country
res.anova.extant<-aov(indicator2 ~ defined_populations_simplified * country_assessment , data=ind2_data_anova)
res.anova.extant
## Call:
##    aov(formula = indicator2 ~ defined_populations_simplified * country_assessment, 
##     data = ind2_data_anova)
## 
## Terms:
##                 defined_populations_simplified country_assessment
## Sum of Squares                        3.349251           3.683287
## Deg. of Freedom                             11                  8
##                 defined_populations_simplified:country_assessment Residuals
## Sum of Squares                                           1.711872 25.670183
## Deg. of Freedom                                                38       507
## 
## Residual standard error: 0.2250145
## 50 out of 108 effects not estimable
## Estimated effects may be unbalanced
summary(res.anova.extant)
##                                                    Df Sum Sq Mean Sq F value
## defined_populations_simplified                     11  3.349  0.3045   6.014
## country_assessment                                  8  3.683  0.4604   9.093
## defined_populations_simplified:country_assessment  38  1.712  0.0450   0.890
## Residuals                                         507 25.670  0.0506        
##                                                     Pr(>F)    
## defined_populations_simplified                    2.88e-09 ***
## country_assessment                                1.03e-11 ***
## defined_populations_simplified:country_assessment     0.66    
## Residuals                                                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):

# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]

Two-way ANOVA with interaction effect of the method to define populations and the country, but keeping only groups with enough n

# variables with enough n
enough_n<-ind2_data %>%
  group_by(country_assessment, defined_populations_simplified) %>% 
  summarise(n=n()) %>% 
  filter(n>=15)
## `summarise()` has grouped output by 'country_assessment'. You can override
## using the `.groups` argument.
# subset data without outliers and with enough n
ind2_data_anova<- ind2_data %>% 
  filter(indicator2<200) %>% 
  # this gives the country
  filter(country_assessment==unique(enough_n$country_assessment)[1] & 
           #this gives the methods for that country (the last [[1]] is to get the results out of a list)
           defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[1], 2][[1]]  | 
           
           # the same for rest of countries. Notice the use of & for methods within country and | to change to other country
           country_assessment==unique(enough_n$country_assessment)[2] & 
           defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[2], 2][[1]] | 
           
           country_assessment==unique(enough_n$country_assessment)[3] & 
           defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[3], 2][[1]] |
           
           country_assessment==unique(enough_n$country_assessment)[4] & 
           defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[4], 2][[1]] | 
           
           country_assessment==unique(enough_n$country_assessment)[5] & 
           defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[5], 2][[1]] | 
           
           country_assessment==unique(enough_n$country_assessment)[6] & 
           defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[6], 2][[1]] | 
           
           country_assessment==unique(enough_n$country_assessment)[7] & 
           defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[7], 2][[1]] | 
           
           country_assessment==unique(enough_n$country_assessment)[8] & 
           defined_populations_simplified %in% enough_n[enough_n$country_assessment==unique(enough_n$country_assessment)[8], 2][[1]])

# summary of n per variable
ind2_data_anova %>%
  group_by(country_assessment, defined_populations_simplified) %>% summarise(n=n())
## `summarise()` has grouped output by 'country_assessment'. You can override
## using the `.groups` argument.
# Two-way ANOVA with interaction effect of the method to define populations and the country
res.anova.extant<-aov(indicator2 ~ defined_populations_simplified * country_assessment , data=ind2_data_anova)
res.anova.extant
## Call:
##    aov(formula = indicator2 ~ defined_populations_simplified * country_assessment, 
##     data = ind2_data_anova)
## 
## Terms:
##                 defined_populations_simplified country_assessment
## Sum of Squares                        4.502732           1.409864
## Deg. of Freedom                              7                  4
##                 defined_populations_simplified:country_assessment Residuals
## Sum of Squares                                           0.010695 14.329461
## Deg. of Freedom                                                 3       294
## 
## Residual standard error: 0.2207706
## 49 out of 64 effects not estimable
## Estimated effects may be unbalanced
summary(res.anova.extant)
##                                                    Df Sum Sq Mean Sq F value
## defined_populations_simplified                      7  4.503  0.6432  13.198
## country_assessment                                  4  1.410  0.3525   7.232
## defined_populations_simplified:country_assessment   3  0.011  0.0036   0.073
## Residuals                                         294 14.329  0.0487        
##                                                     Pr(>F)    
## defined_populations_simplified                    8.61e-15 ***
## country_assessment                                1.45e-05 ***
## defined_populations_simplified:country_assessment    0.974    
## Residuals                                                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since there significant differences, check which pairs are actually different (Tukey Honest Significant Differences):

# only padj < 0.05
x<-TukeyHSD(res.anova.extant)
x<-as.data.frame(x[[1]])
x[x$`p adj`<0.05, ]

Estimate indicator 3 (number of taxa with genetic monitoring squemes)

Indicator 3 refers to the number (count) of taxa by country in which genetic monitoring is occurring. This is stored in the variable temp_gen_monitoring as a “yes/no” answer for each taxon, so to estimate the indicator, we only need to count how many said “yes”, keeping only one of the records when the taxon was multiassessed:

indicator3<-ind3_data %>%
                 # keep only one record if the taxon was assessed more than once within the country
                 select(country_assessment, taxon, temp_gen_monitoring) %>%
                 filter(!duplicated(.)) %>%

                 # count "yes" in tem_gen_monitoring by country
                 filter(temp_gen_monitoring=="yes") %>%
                 group_by(country_assessment) %>%
                 summarise(n_taxon_gen_monitoring= n())

Plot indicator 3 by country:

 ## from the table counting studies by country
indicator3 %>%
  ggplot(aes(x=country_assessment, y=n_taxon_gen_monitoring)) +
  geom_bar(stat="identity")

## from raw data to include other variables for color by taxonomic group
ind3_data %>%
                 # keep only one record if the taxon was assessed more than once within the country
                 select(country_assessment, taxon, temp_gen_monitoring, taxonomic_group) %>%
                 filter(!duplicated(.)) %>%

                 # count "yes" in tem_gen_monitoring by country
                 filter(temp_gen_monitoring=="yes") %>%
ggplot(aes(x=country_assessment, fill=taxonomic_group)) +
  geom_bar() +
  scale_fill_manual(values= taxoncolors,
                    breaks=levels(as.factor(ind3_data$taxonomic_group))) +
  theme_bw()

## by iucn

ind3_data$global_IUCN<-metadata$global_IUCN

ind3_data %>%
                 # keep only one record if the taxon was assessed more than once within the country
                 select(country_assessment, taxon, temp_gen_monitoring, global_IUCN) %>%
                 filter(!duplicated(.)) %>%

                 # count "yes" in tem_gen_monitoring by country
                 filter(temp_gen_monitoring=="yes") %>%
ggplot(aes(x=country_assessment, fill=global_IUCN)) +
  geom_bar() +
  scale_fill_manual(values= IUCNcolors, # iucn color codes
                    breaks=levels(as.factor(ind3_data$global_IUCN))) +
      theme_bw()

Relatively few taxa have genetic monitoring, but many have some sort of genetic study. Let’s check that, but first subset the ind3_data keeping only taxa assessed a single time, plust the first record of those assessed multiple times.

#subset keeping only taxa assessed a single time, plust the first record of those assessed multiple times.
ind3_data_firstmulti<-ind3_data[!duplicated(cbind(ind3_data$taxon, ind3_data$country_assessment)), ]

Sankey plot of genetic studies

# transform data to how ggsankey wants it
df <- ind3_data_firstmulti %>%
  make_long(country_assessment, temp_gen_monitoring, gen_studies)

# plot
ggplot(df, aes(x = x,
               next_x = next_x,
               node = node,
               next_node = next_node,
               fill = factor(node),
               label = node)) +
  geom_sankey(flow.alpha = 0.5, 
              show.legend = FALSE) +
  geom_sankey_label(size = 2.5, color = "black", fill = "white") +
  theme_sankey(base_size = 10) +

    # manually set flow fill according to desired color
                            # countries
  scale_fill_manual(values=c(scales::hue_pal()(length(unique(ind3_data_firstmulti$country_assessment))),  
                             # traffic light for monitoring
                             c("darkolivegreen", "brown3", "darkgrey"),
                             # nice soft colors for gen_studies
                             c("grey50", "grey35", "grey50", "brown3")),
                              
                    breaks=c(unique(ind3_data_firstmulti$country_assessment),
                             unique(ind3_data_firstmulti$temp_gen_monitoring),
                             unique(ind3_data_firstmulti$gen_studies))) +
  
  xlab("")

Similar, but alluvial to show data colloring the flow by country

# format data as needed
foralluvial_2<-ind3_data_firstmulti %>% group_by(country_assessment, temp_gen_monitoring, gen_studies) %>%
             summarise(n=n())

# define colors
my_cols<- gg_color_hue(length(unique(foralluvial_2$country_assessment)))
# we need a vector of colors by country for each row of the dataset, so:
countries<-as.factor(foralluvial_2$country_assessment)
levels(countries)<-my_cols
countries<-as.vector(countries)

# plot 
alluvial(foralluvial_2[,1:3], freq = foralluvial_2$n,
         col=countries,  
         blocks=FALSE,
         gap.width = 0.5,
         cex=.8, 
         xw = 0.1,
         cw = 0.2,
         border = NA)

Session Info for reproducibility purposes:

sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] cowplot_1.1.1      viridis_0.6.3      viridisLite_0.4.0  alluvial_0.1-2    
##  [5] ggsankey_0.0.99999 ggplot2_3.4.1      stringr_1.4.0      utile.tools_0.2.7 
##  [9] dplyr_1.0.9        tidyr_1.2.0       
## 
## loaded via a namespace (and not attached):
##  [1] highr_0.9        pillar_1.7.0     bslib_0.3.1      compiler_4.2.1  
##  [5] jquerylib_0.1.4  tools_4.2.1      digest_0.6.29    gtable_0.3.0    
##  [9] jsonlite_1.8.0   evaluate_0.15    lifecycle_1.0.3  tibble_3.1.7    
## [13] pkgconfig_2.0.3  rlang_1.0.6      cli_3.6.0        DBI_1.1.3       
## [17] rstudioapi_0.13  yaml_2.3.5       xfun_0.31        fastmap_1.1.0   
## [21] gridExtra_2.3    withr_2.5.0      knitr_1.39       generics_0.1.3  
## [25] vctrs_0.5.2      sass_0.4.1       grid_4.2.1       tidyselect_1.1.2
## [29] glue_1.6.2       R6_2.5.1         fansi_1.0.3      rmarkdown_2.14  
## [33] farver_2.1.1     purrr_0.3.4      magrittr_2.0.3   scales_1.2.0    
## [37] ellipsis_0.3.2   htmltools_0.5.5  assertthat_0.2.1 colorspace_2.0-3
## [41] labeling_0.4.2   utf8_1.2.2       stringi_1.7.6    munsell_0.5.0   
## [45] crayon_1.5.1